Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--   lib/Transforms/Vectorize/BBVectorize.cpp          |   77
-rw-r--r--   lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  |   13
-rw-r--r--   lib/Transforms/Vectorize/LoopVectorize.cpp        | 2965
-rw-r--r--   lib/Transforms/Vectorize/SLPVectorizer.cpp        |  846
4 files changed, 2258 insertions, 1643 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index c01740b27d59..c83b3f7b225b 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -494,13 +494,13 @@ namespace { if (StoreInst *SI = dyn_cast<StoreInst>(I)) { // For stores, it is the value type, not the pointer type that matters // because the value is what will come from a vector register. - + Value *IVal = SI->getValueOperand(); T1 = IVal->getType(); } else { T1 = I->getType(); } - + if (CastInst *CI = dyn_cast<CastInst>(I)) T2 = CI->getSrcTy(); else @@ -547,10 +547,11 @@ namespace { // Returns the cost of the provided instruction using TTI. // This does not handle loads and stores. unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, - TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue) { + TargetTransformInfo::OK_AnyValue, + const Instruction *I = nullptr) { switch (Opcode) { default: break; case Instruction::GetElementPtr: @@ -584,7 +585,7 @@ namespace { case Instruction::Select: case Instruction::ICmp: case Instruction::FCmp: - return TTI->getCmpSelInstrCost(Opcode, T1, T2); + return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -598,7 +599,7 @@ namespace { case Instruction::FPTrunc: case Instruction::BitCast: case Instruction::ShuffleVector: - return TTI->getCastInstrCost(Opcode, T1, T2); + return TTI->getCastInstrCost(Opcode, T1, T2, I); } return 1; @@ -894,7 +895,7 @@ namespace { // vectors that has a scalar condition results in a malformed select. // FIXME: We could probably be smarter about this by rewriting the select // with different types instead. - return (SI->getCondition()->getType()->isVectorTy() == + return (SI->getCondition()->getType()->isVectorTy() == SI->getTrueValue()->getType()->isVectorTy()); } else if (isa<CmpInst>(I)) { if (!Config.VectorizeCmp) @@ -1044,14 +1045,14 @@ namespace { return false; } } else if (TTI) { - unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); - unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); - Type *VT1 = getVecTypeForPair(IT1, JT1), - *VT2 = getVecTypeForPair(IT2, JT2); TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); // On some targets (example X86) the cost of a vector shift may vary // depending on whether the second operand is a Uniform or @@ -1090,7 +1091,7 @@ namespace { // but this cost is ignored (because insert and extract element // instructions are assigned a zero depth factor and are not really // fused in general). 
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); if (VCost > ICost + JCost) return false; @@ -1127,39 +1128,51 @@ namespace { FastMathFlags FMFCI; if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI)) FMFCI = FPMOCI->getFastMathFlags(); + SmallVector<Value *, 4> IArgs(CI->arg_operands()); + unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI); - SmallVector<Type*, 4> Tys; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) - Tys.push_back(CI->getArgOperand(i)->getType()); - unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI); - - Tys.clear(); CallInst *CJ = cast<CallInst>(J); FastMathFlags FMFCJ; if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ)) FMFCJ = FPMOCJ->getFastMathFlags(); - for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) - Tys.push_back(CJ->getArgOperand(i)->getType()); - unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ); + SmallVector<Value *, 4> JArgs(CJ->arg_operands()); + unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ); - Tys.clear(); assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && "Intrinsic argument counts differ"); + SmallVector<Type*, 4> Tys; + SmallVector<Value *, 4> VecArgs; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && i == 1) + IID == Intrinsic::cttz) && i == 1) { Tys.push_back(CI->getArgOperand(i)->getType()); - else + VecArgs.push_back(CI->getArgOperand(i)); + } + else { Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), CJ->getArgOperand(i)->getType())); + // Add both operands, and then count their scalarization overhead + // with VF 1. + VecArgs.push_back(CI->getArgOperand(i)); + VecArgs.push_back(CJ->getArgOperand(i)); + } } + // Compute the scalarization cost here with the original operands (to + // check for uniqueness etc), and then call getIntrinsicInstrCost() + // with the constructed vector types. + Type *RetTy = getVecTypeForPair(IT1, JT1); + unsigned ScalarizationCost = 0; + if (!RetTy->isVoidTy()) + ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false); + ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1); + FastMathFlags FMFV = FMFCI; FMFV &= FMFCJ; - Type *RetTy = getVecTypeForPair(IT1, JT1); - unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV); + unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV, + ScalarizationCost); if (VCost > ICost + JCost) return false; @@ -2502,7 +2515,7 @@ namespace { if (I2 == I1 || isa<UndefValue>(I2)) I2 = nullptr; } - + if (HEE) { Value *I3 = HEE->getOperand(0); if (!I2 && I3 != I1) @@ -2693,14 +2706,14 @@ namespace { // so extend the smaller vector to be the same length as the larger one. Instruction *NLOp; if (numElemL > 1) { - + std::vector<Constant *> Mask(numElemH); unsigned v = 0; for (; v < numElemL; ++v) Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); for (; v < numElemH; ++v) Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - + NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), ConstantVector::get(Mask), getReplacementName(IBeforeJ ? I : J, @@ -2710,7 +2723,7 @@ namespace { getReplacementName(IBeforeJ ? I : J, true, o, 1)); } - + NLOp->insertBefore(IBeforeJ ? 
J : I); LOp = NLOp; } @@ -2720,7 +2733,7 @@ namespace { if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeH, VArgType, IBeforeJ)) { Instruction *S = - InsertElementInst::Create(LOp, HOp, + InsertElementInst::Create(LOp, HOp, ConstantInt::get(Type::getInt32Ty(Context), numElemL), getReplacementName(IBeforeJ ? I : J, @@ -2737,7 +2750,7 @@ namespace { Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); for (; v < numElemL; ++v) Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - + NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), ConstantVector::get(Mask), getReplacementName(IBeforeJ ? I : J, diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index c44a393cf846..4409d7a404f8 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBytes = ElementSizeBits / 8; unsigned SizeBytes = ElementSizeBytes * Chain.size(); unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; - if (NumLeft == Chain.size()) - --NumLeft; - else if (NumLeft == 0) + if (NumLeft == Chain.size()) { + if ((NumLeft & 1) == 0) + NumLeft /= 2; // Split even in half + else + --NumLeft; // Split off last element + } else if (NumLeft == 0) NumLeft = 1; return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); } @@ -588,7 +591,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Make sure all the users of a vector are constant-index extracts. - if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) { + if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) { const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) @@ -622,7 +625,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) { if (TySize > VecRegSize / 2) continue; - if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) { + if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) { const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index dac7032fa08f..595b2ec88943 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -92,6 +93,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" @@ -266,21 +268,6 @@ static bool hasCyclesInLoopBody(const Loop &L) { return false; } -/// \brief This modifies LoopAccessReport to initialize message with -/// loop-vectorizer-specific part. -class VectorizationReport : public LoopAccessReport { -public: - VectorizationReport(Instruction *I = nullptr) - : LoopAccessReport("loop not vectorized: ", I) {} - - /// \brief This allows promotion of the loop-access analysis report into the - /// loop-vectorizer report. 
It modifies the message to add the - /// loop-vectorizer-specific part of the message. - explicit VectorizationReport(const LoopAccessReport &R) - : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), - R.getInstr()) {} -}; - /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -290,31 +277,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } -/// A helper function that returns GEP instruction and knows to skip a -/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination -/// pointee types of the 'bitcast' have the same size. -/// For example: -/// bitcast double** %var to i64* - can be skipped -/// bitcast double** %var to i8* - can not -static GetElementPtrInst *getGEPInstruction(Value *Ptr) { - - if (isa<GetElementPtrInst>(Ptr)) - return cast<GetElementPtrInst>(Ptr); - - if (isa<BitCastInst>(Ptr) && - isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { - Type *BitcastTy = Ptr->getType(); - Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); - if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) - return nullptr; - Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); - Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); - const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); - if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) - return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); - } - return nullptr; -} +// FIXME: The following helper functions have multiple implementations +// in the project. They can be effectively organized in a common Load/Store +// utilities unit. /// A helper function that returns the pointer operand of a load or store /// instruction. @@ -326,6 +291,34 @@ static Value *getPointerOperand(Value *I) { return nullptr; } +/// A helper function that returns the type of loaded or stored value. +static Type *getMemInstValueType(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getType(); + return cast<StoreInst>(I)->getValueOperand()->getType(); +} + +/// A helper function that returns the alignment of load or store instruction. +static unsigned getMemInstAlignment(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getAlignment(); + return cast<StoreInst>(I)->getAlignment(); +} + +/// A helper function that returns the address space of the pointer operand of +/// load or store instruction. +static unsigned getMemInstAddressSpace(Value *I) { + assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerAddressSpace(); + return cast<StoreInst>(I)->getPointerAddressSpace(); +} + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. @@ -351,6 +344,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { /// we always assume predicated blocks have a 50% chance of executing. 
static unsigned getReciprocalPredBlockProb() { return 2; } +/// A helper function that adds a 'fast' flag to floating-point operations. +static Value *addFastMathFlag(Value *V) { + if (isa<FPMathOperator>(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast<Instruction>(V)->setFastMathFlags(Flags); + } + return V; +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -428,10 +438,17 @@ protected: /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// Handle all cross-iteration phis in the header. + void fixCrossIterationPHIs(); + /// Fix a first-order recurrence. This is the second phase of vectorizing /// this phi node. void fixFirstOrderRecurrence(PHINode *Phi); + /// Fix a reduction cross-iteration phi. This is the second phase of + /// vectorizing this phi node. + void fixReduction(PHINode *Phi); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -448,7 +465,8 @@ protected: /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions(); + void collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions); /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. @@ -462,14 +480,14 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single BB within the innermost loop. - void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); + /// A helper function to vectorize a single instruction within the innermost + /// loop. + void vectorizeInstruction(Instruction &I); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF, - PhiVector *PV); + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. @@ -504,20 +522,21 @@ protected: /// \p EntryVal is the value from the original loop that maps to the steps. /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it /// can be a truncate instruction). - void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal); - - /// Create a vector induction phi node based on an existing scalar one. This - /// currently only works for integer induction variables with a constant - /// step. \p EntryVal is the value from the original loop that maps to the - /// vector phi node. If \p EntryVal is a truncate instruction, instead of - /// widening the original IV, we widen a version of the IV truncated to \p - /// EntryVal's type. - void createVectorIntInductionPHI(const InductionDescriptor &II, - Instruction *EntryVal); - - /// Widen an integer induction variable \p IV. 
If \p Trunc is provided, the - /// induction variable will first be truncated to the corresponding type. - void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal, + const InductionDescriptor &ID); + + /// Create a vector induction phi node based on an existing scalar one. \p + /// EntryVal is the value from the original loop that maps to the vector phi + /// node, and \p Step is the loop-invariant step. If \p EntryVal is a + /// truncate instruction, instead of widening the original IV, we widen a + /// version of the IV truncated to \p EntryVal's type. + void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, + Value *Step, Instruction *EntryVal); + + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -583,6 +602,10 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// \brief Set the debug location in the builder using the debug location in + /// the instruction. + void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// This is a helper class for maintaining vectorization state. It's used for /// mapping values from the original loop to their corresponding values in /// the new loop. Two mappings are maintained: one for vectorized values and @@ -777,14 +800,6 @@ protected: // Record whether runtime checks are added. bool AddedSafetyChecks; - // Holds instructions from the original loop whose counterparts in the - // vectorized loop would be trivially dead if generated. For example, - // original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; @@ -803,8 +818,6 @@ public: UnrollFactor, LVL, CM) {} private: - void scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, @@ -832,12 +845,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { return I; } -/// \brief Set the debug location in the builder using the debug location in the -/// instruction. 
-static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) - B.SetCurrentDebugLocation(Inst->getDebugLoc()); - else +void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { + const DILocation *DIL = Inst->getDebugLoc(); + if (DIL && Inst->getFunction()->isDebugInfoForProfiling()) + B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF)); + else + B.SetCurrentDebugLocation(DIL); + } else B.SetCurrentDebugLocation(DebugLoc()); } @@ -1497,14 +1512,6 @@ private: OptimizationRemarkEmitter &ORE; }; -static void emitAnalysisDiag(const Loop *TheLoop, - const LoopVectorizeHints &Hints, - OptimizationRemarkEmitter &ORE, - const LoopAccessReport &Message) { - const char *Name = Hints.vectorizeAnalysisPassName(); - LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); -} - static void emitMissedWarning(Function *F, Loop *L, const LoopVectorizeHints &LH, OptimizationRemarkEmitter *ORE) { @@ -1512,13 +1519,17 @@ static void emitMissedWarning(Function *F, Loop *L, if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedVectorization", + L->getStartLoc(), L->getHeader()) + << "loop not vectorized: " + << "failed explicitly specified loop vectorization"); else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(), + L->getHeader()) + << "loop not interleaved: " + << "failed explicitly specified loop interleaving"); } } @@ -1546,7 +1557,7 @@ public: LoopVectorizeHints *H) : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all @@ -1566,8 +1577,8 @@ public: /// loop, only that it is legal to do so. bool canVectorize(); - /// Returns the Induction variable. - PHINode *getInduction() { return Induction; } + /// Returns the primary induction variable. + PHINode *getPrimaryInduction() { return PrimaryInduction; } /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } @@ -1607,12 +1618,6 @@ public: /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } - - /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); } - /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1689,15 +1694,9 @@ public: /// instructions that may divide by zero. 
bool isScalarWithPredication(Instruction *I); - /// Returns true if \p I is a memory instruction that has a consecutive or - /// consecutive-like pointer operand. Consecutive-like pointers are pointers - /// that are treated like consecutive pointers during vectorization. The - /// pointer operands of interleaved accesses are an example. - bool hasConsecutiveLikePtrOperand(Instruction *I); - - /// Returns true if \p I is a memory instruction that must be scalarized - /// during vectorization. - bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction with consecutive memory + /// access that can be widened. + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); private: /// Check if a single basic block loop is vectorizable. @@ -1715,24 +1714,6 @@ private: /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the instructions that are uniform after vectorization. An - /// instruction is uniform if we represent it with a single scalar value in - /// the vectorized loop corresponding to each vector iteration. Examples of - /// uniform instructions include pointer operands of consecutive or - /// interleaved memory accesses. Note that although uniformity implies an - /// instruction will be scalar, the reverse is not true. In general, a - /// scalarized instruction will be represented by VF scalar values in the - /// vectorized loop, each corresponding to an iteration of the original - /// scalar loop. - void collectLoopUniforms(); - - /// Collect the instructions that are scalar after vectorization. An - /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. - void collectLoopScalars(); - /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1744,14 +1725,6 @@ private: void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, SmallPtrSetImpl<Value *> &AllowedExit); - /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. These are handled as LoopAccessReport rather than - /// VectorizationReport because the << operator of VectorizationReport returns - /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheLoop, *Hints, *ORE, Message); - } - /// Create an analysis remark that explains why vectorization failed /// /// \p RemarkName is the identifier for the remark. If \p I is passed it is @@ -1804,9 +1777,9 @@ private: // --- vectorization state --- // - /// Holds the integer induction variable. This is the counter of the + /// Holds the primary induction variable. This is the counter of the /// loop. - PHINode *Induction; + PHINode *PrimaryInduction; /// Holds the reduction variables. ReductionList Reductions; /// Holds all of the induction variables that we found in the loop. @@ -1822,12 +1795,6 @@ private: /// vars which can be accessed from outside the loop. SmallPtrSet<Value *, 4> AllowedExit; - /// Holds the instructions known to be uniform after vectorization. - SmallPtrSet<Instruction *, 4> Uniforms; - - /// Holds the instructions known to be scalar after vectorization. 
- SmallPtrSet<Instruction *, 4> Scalars; - /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -1861,16 +1828,26 @@ public: : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} + /// \return An upper bound for the vectorization factor, or None if + /// vectorization should be avoided up front. + Optional<unsigned> computeMaxVF(bool OptForSize); + /// Information about vectorization costs struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width }; /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to VF. If UserVF is not ZERO + /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(bool OptForSize); + VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + + /// Setup cost-based decisions for user vectorization factor. + void selectUserVectorizationFactor(unsigned UserVF) { + collectUniformsAndScalars(UserVF); + collectInstsToScalarize(UserVF); + } /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as @@ -1884,6 +1861,15 @@ public: unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); + /// Memory access instruction may be vectorized in more than one way. + /// Form of instruction after vectorization depends on cost. + /// This function takes cost-based decisions for Load/Store instructions + /// and collects them in a map. This decisions map is used for building + /// the lists of loop-uniform and loop-scalar instructions. + /// The calculated cost is saved with widening decision in order to + /// avoid redundant calculations. + void setCostBasedWideningDecision(unsigned VF); + /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -1918,14 +1904,118 @@ public: return Scalars->second.count(I); } + /// Returns true if \p I is known to be uniform after vectorization. + bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) + return true; + assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity"); + auto UniformsPerVF = Uniforms.find(VF); + return UniformsPerVF->second.count(I); + } + + /// Returns true if \p I is known to be scalar after vectorization. + bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { + if (VF == 1) + return true; + assert(Scalars.count(VF) && "Scalar values are not calculated for VF"); + auto ScalarsPerVF = Scalars.find(VF); + return ScalarsPerVF->second.count(I); + } + /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && - !Legal->isScalarAfterVectorization(I); + !isScalarAfterVectorization(I, VF); + } + + /// Decision that was taken during cost calculation for memory instruction. + enum InstWidening { + CM_Unknown, + CM_Widen, + CM_Interleave, + CM_GatherScatter, + CM_Scalarize + }; + + /// Save vectorization decision \p W and \p Cost taken by the cost model for + /// instruction \p I and vector width \p VF. 
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, + unsigned Cost) { + assert(VF >= 2 && "Expected VF >=2"); + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + } + + /// Save vectorization decision \p W and \p Cost taken by the cost model for + /// interleaving group \p Grp and vector width \p VF. + void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, + InstWidening W, unsigned Cost) { + assert(VF >= 2 && "Expected VF >=2"); + /// Broadcast this decicion to all instructions inside the group. + /// But the cost will be assigned to one instruction only. + for (unsigned i = 0; i < Grp->getFactor(); ++i) { + if (auto *I = Grp->getMember(i)) { + if (Grp->getInsertPos() == I) + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + else + WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + } + } + } + + /// Return the cost model decision for the given instruction \p I and vector + /// width \p VF. Return CM_Unknown if this instruction did not pass + /// through the cost modeling. + InstWidening getWideningDecision(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + auto Itr = WideningDecisions.find(InstOnVF); + if (Itr == WideningDecisions.end()) + return CM_Unknown; + return Itr->second.first; + } + + /// Return the vectorization cost for the given instruction \p I and vector + /// width \p VF. + unsigned getWideningCost(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); + assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated"); + return WideningDecisions[InstOnVF].second; + } + + /// Return True if instruction \p I is an optimizable truncate whose operand + /// is an induction variable. Such a truncate will be removed by adding a new + /// induction variable with the destination type. + bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + + // If the instruction is not a truncate, return false. + auto *Trunc = dyn_cast<TruncInst>(I); + if (!Trunc) + return false; + + // Get the source and destination types of the truncate. + Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); + Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); + + // If the truncate is free for the given types, return false. Replacing a + // free truncate with an induction variable would add an induction variable + // update instruction to each iteration of the loop. We exclude from this + // check the primary induction variable since it will need an update + // instruction regardless. + Value *Op = Trunc->getOperand(0); + if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) + return false; + + // If the truncated value is not an induction variable, return false. + return Legal->isInductionVariable(Op); } private: + /// \return An upper bound for the vectorization factor, larger than zero. + /// One is returned if vectorization should best be avoided due to cost. + unsigned computeFeasibleMaxVF(bool OptForSize); + /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -1949,6 +2039,26 @@ private: /// the vector type as an output parameter. unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + /// Calculate vectorization cost of memory instruction \p I. 
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + + /// The cost computation for scalarized memory instruction. + unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + + /// The cost computation for interleaving group of memory instructions. + unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + + /// The cost computation for Gather/Scatter instruction. + unsigned getGatherScatterCost(Instruction *I, unsigned VF); + + /// The cost computation for widening instruction \p I with consecutive + /// memory access. + unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + + /// The cost calculation for Load instruction \p I with uniform pointer - + /// scalar load + broadcast. + unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -1972,12 +2082,24 @@ private: /// pairs. typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + /// A set containing all BasicBlocks that are known to present after + /// vectorization as a predicated block. + SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + /// Holds the instructions known to be uniform after vectorization. + /// The data is collected per VF. + DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; + + /// Holds the instructions known to be scalar after vectorization. + /// The data is collected per VF. + DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -1990,6 +2112,44 @@ private: /// the loop. void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. + void collectLoopUniforms(unsigned VF); + + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. 
+ if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + + /// Keeps cost model vectorization decision and cost for instructions. + /// Right now it is used for memory instructions only. + typedef DenseMap<std::pair<Instruction *, unsigned>, + std::pair<InstWidening, unsigned>> + DecisionList; + + DecisionList WideningDecisions; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2019,6 +2179,23 @@ public: SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; +/// LoopVectorizationPlanner - drives the vectorization process after having +/// passed Legality checks. +class LoopVectorizationPlanner { +public: + LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + + ~LoopVectorizationPlanner() {} + + /// Plan how to best vectorize, return the best VF and its cost. + LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, + unsigned UserVF); + +private: + /// The profitablity analysis. + LoopVectorizationCostModel &CM; +}; + /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2134,8 +2311,6 @@ struct LoopVectorize : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -2156,7 +2331,7 @@ struct LoopVectorize : public FunctionPass { //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and -// LoopVectorizationCostModel. +// LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -2176,27 +2351,51 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -void InnerLoopVectorizer::createVectorIntInductionPHI( - const InductionDescriptor &II, Instruction *EntryVal) { +void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( + const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { Value *Start = II.getStartValue(); - ConstantInt *Step = II.getConstIntStepValue(); - assert(Step && "Can not widen an IV with a non-constant step"); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = ConstantInt::getSigned(TruncType, Step->getSExtValue()); + Step = Builder.CreateTrunc(Step, TruncType); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } Value *SplatStart = Builder.CreateVectorSplat(VF, Start); - Value *SteppedStart = getStepVector(SplatStart, 0, Step); + Value *SteppedStart = + getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. 
+ Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = II.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(VF, Mul); Builder.restoreIP(CurrIP); - Value *SplatVF = - ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(), - VF * Step->getSExtValue())); // We may need to add the step a number of times, depending on the unroll // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", @@ -2205,8 +2404,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = LastInduction; - LastInduction = cast<Instruction>( - Builder.CreateAdd(LastInduction, SplatVF, "step.add")); + LastInduction = cast<Instruction>(addFastMathFlag( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); } VectorLoopValueMap.initVector(EntryVal, Entry); if (isa<TruncInst>(EntryVal)) @@ -2225,7 +2424,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( } bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { - return Legal->isScalarAfterVectorization(I) || + return Cost->isScalarAfterVectorization(I, VF) || Cost->isProfitableToScalarize(I, VF); } @@ -2239,7 +2438,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { + + assert((IV->getType()->isIntegerTy() || IV != OldInduction) && + "Primary induction variable must have an integer type"); auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -2251,9 +2453,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // induction variable. Value *ScalarIV = nullptr; - // The step of the induction. - Value *Step = nullptr; - // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; @@ -2266,45 +2465,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // least one user in the loop that is not widened. auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); - // If the induction variable has a constant integer step value, go ahead and - // get it now. - if (ID.getConstIntStepValue()) - Step = ID.getConstIntStepValue(); + // Generate code for the induction step. 
Note that induction steps are + // required to be loop-invariant + assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && + "Induction step should be loop invariant"); + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + Value *Step = nullptr; + if (PSE.getSE()->isSCEVable(IV->getType())) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), + LoopVectorPreHeader->getTerminator()); + } else { + Step = cast<SCEVUnknown>(ID.getStep())->getValue(); + } // Try to create a new independent vector induction variable. If we can't // create the phi node, we will splat the scalar induction variable in each // loop iteration. - if (VF > 1 && IV->getType() == Induction->getType() && Step && - !shouldScalarizeInstruction(EntryVal)) { - createVectorIntInductionPHI(ID, EntryVal); + if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { + createVectorIntOrFpInductionPHI(ID, Step, EntryVal); VectorizedIV = true; } // If we haven't yet vectorized the induction variable, or if we will create // a scalar one, we need to define the scalar induction variable and step // values. If we were given a truncation type, truncate the canonical - // induction variable and constant step. Otherwise, derive these values from - // the induction descriptor. + // induction variable and step. Otherwise, derive these values from the + // induction descriptor. if (!VectorizedIV || NeedsScalarIV) { + ScalarIV = Induction; + if (IV != OldInduction) { + ScalarIV = IV->getType()->isIntegerTy() + ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) + : Builder.CreateCast(Instruction::SIToFP, Induction, + IV->getType()); + ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); + ScalarIV->setName("offset.idx"); + } if (Trunc) { auto *TruncType = cast<IntegerType>(Trunc->getType()); - assert(Step && "Truncation requires constant integer step"); - auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); - ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); - Step = ConstantInt::getSigned(TruncType, StepInt); - } else { - ScalarIV = Induction; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - if (IV != OldInduction) { - ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); - ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); - ScalarIV->setName("offset.idx"); - } - if (!Step) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - &*Builder.GetInsertPoint()); - } + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); + Step = Builder.CreateTrunc(Step, TruncType); } } @@ -2314,7 +2517,8 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + Entry[Part] = + getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.initVector(EntryVal, Entry); if (Trunc) addMetadata(Entry, Trunc); @@ -2327,7 +2531,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // in the loop in the common case prior to InstCombine. We will be trading // one vector extract for each scalar step. 
if (NeedsScalarIV) - buildScalarSteps(ScalarIV, Step, EntryVal); + buildScalarSteps(ScalarIV, Step, EntryVal, ID); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2387,30 +2591,43 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Value *EntryVal) { + Value *EntryVal, + const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && - "Val and Step should have the same integer type"); + assert(ScalarIVTy == Step->getType() && + "Val and Step should have the same type"); + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (ScalarIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } // Determine the number of scalars we need to generate for each unroll // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF; + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); - auto *Mul = Builder.CreateMul(StartIdx, Step); - auto *Add = Builder.CreateAdd(ScalarIV, Mul); + auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); + auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); Entry[Part][Lane] = Add; } } @@ -2469,7 +2686,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // known to be uniform after vectorization, this corresponds to lane zero // of the last unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the last unroll iteration. - unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); // Set the insert point after the last scalarized instruction. This ensures @@ -2486,7 +2703,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // VectorLoopValueMap, we will only generate the insertelements once. for (unsigned Part = 0; Part < UF; ++Part) { Value *VectorValue = nullptr; - if (Legal->isUniformAfterVectorization(I)) { + if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); } else { VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); @@ -2515,8 +2732,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? 
!Legal->isUniformAfterVectorization(cast<Instruction>(V)) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? + !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested @@ -2551,102 +2769,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } -// Get a mask to interleave \p NumVec vectors into a wide vector. -// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> -// E.g. For 2 interleaved vectors, if VF is 4, the mask is: -// <0, 4, 1, 5, 2, 6, 3, 7> -static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, - unsigned NumVec) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < VF; i++) - for (unsigned j = 0; j < NumVec; j++) - Mask.push_back(Builder.getInt32(j * VF + i)); - - return ConstantVector::get(Mask); -} - -// Get the strided mask starting from index \p Start. -// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)> -static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, - unsigned Stride, unsigned VF) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < VF; i++) - Mask.push_back(Builder.getInt32(Start + i * Stride)); - - return ConstantVector::get(Mask); -} - -// Get a mask of two parts: The first part consists of sequential integers -// starting from 0, The second part consists of UNDEFs. -// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, - unsigned NumUndef) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumInt; i++) - Mask.push_back(Builder.getInt32(i)); - - Constant *Undef = UndefValue::get(Builder.getInt32Ty()); - for (unsigned i = 0; i < NumUndef; i++) - Mask.push_back(Undef); - - return ConstantVector::get(Mask); -} - -// Concatenate two vectors with the same element type. The 2nd vector should -// not have more elements than the 1st vector. If the 2nd vector has less -// elements, extend it with UNDEFs. -static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, - Value *V2) { - VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); - VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); - assert(VecTy1 && VecTy2 && - VecTy1->getScalarType() == VecTy2->getScalarType() && - "Expect two vectors with the same element type"); - - unsigned NumElts1 = VecTy1->getNumElements(); - unsigned NumElts2 = VecTy2->getNumElements(); - assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); - - if (NumElts1 > NumElts2) { - // Extend with UNDEFs. - Constant *ExtMask = - getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); - V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); - } - - Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); - return Builder.CreateShuffleVector(V1, V2, Mask); -} - -// Concatenate vectors in the given list. All vectors have the same type. 
-static Value *ConcatenateVectors(IRBuilder<> &Builder, - ArrayRef<Value *> InputList) { - unsigned NumVec = InputList.size(); - assert(NumVec > 1 && "Should be at least two vectors"); - - SmallVector<Value *, 8> ResList; - ResList.append(InputList.begin(), InputList.end()); - do { - SmallVector<Value *, 8> TmpList; - for (unsigned i = 0; i < NumVec - 1; i += 2) { - Value *V0 = ResList[i], *V1 = ResList[i + 1]; - assert((V0->getType() == V1->getType() || i == NumVec - 2) && - "Only the last vector may have a different type"); - - TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); - } - - // Push the last vector if the total number of vectors is odd. - if (NumVec % 2 != 0) - TmpList.push_back(ResList[NumVec - 1]); - - ResList = TmpList; - NumVec = ResList.size(); - } while (NumVec > 1); - - return ResList[0]; -} - // Try to vectorize the interleave group that \p Instr belongs to. // // E.g. Translate following interleaved load group (factor = 3): @@ -2683,15 +2805,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Instr != Group->getInsertPos()) return; - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. - Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr)); // Prepare for the new pointers. setDebugLocFromInst(Builder, Ptr); @@ -2731,7 +2851,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *UndefVec = UndefValue::get(VecTy); // Vectorize the interleaved load group. - if (LI) { + if (isa<LoadInst>(Instr)) { // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; @@ -2752,7 +2872,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { continue; VectorParts Entry(UF); - Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF); + Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); @@ -2796,10 +2916,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { } // Concatenate all vectors into a wide vector. - Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. - Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); + Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); @@ -2816,103 +2936,44 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert((LI || SI) && "Invalid Load/Store instruction"); - // Try to vectorize the interleave group if this access is interleaved. 
- if (Legal->isAccessInterleaved(Instr)) + LoopVectorizationCostModel::InstWidening Decision = + Cost->getWideningDecision(Instr, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == LoopVectorizationCostModel::CM_Interleave) return vectorizeInterleaveGroup(Instr); - Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getPointerOperand(Instr); - unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Alignment = getMemInstAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + unsigned AddressSpace = getMemInstAddressSpace(Instr); // Scalarize the memory instruction if necessary. - if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + if (Decision == LoopVectorizationCostModel::CM_Scalarize) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. bool CreateGatherScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); + (Decision == LoopVectorizationCostModel::CM_GatherScatter); VectorParts VectorGep; // Handle consecutive loads/stores. - GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (ConsecutiveStride) { - if (Gep) { - unsigned NumOperands = Gep->getNumOperands(); -#ifndef NDEBUG - // The original GEP that identified as a consecutive memory access - // should have only one loop-variant operand. - unsigned NumOfLoopVariantOps = 0; - for (unsigned i = 0; i < NumOperands; ++i) - if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), - OrigLoop)) - NumOfLoopVariantOps++; - assert(NumOfLoopVariantOps == 1 && - "Consecutive GEP should have only one loop-variant operand"); -#endif - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setName("gep.indvar"); - - // A new GEP is created for a 0-lane value of the first unroll iteration. - // The GEPs for the rest of the unroll iterations are computed below as an - // offset from this GEP. - for (unsigned i = 0; i < NumOperands; ++i) - // We can apply getScalarValue() for all GEP indices. It returns an - // original value for loop-invariant operand and 0-lane for consecutive - // operand. - Gep2->setOperand(i, getScalarValue(Gep->getOperand(i), - 0, /* First unroll iteration */ - 0 /* 0-lane of the vector */ )); - setDebugLocFromInst(Builder, Gep); - Ptr = Builder.Insert(Gep2); - - } else { // No GEP - setDebugLocFromInst(Builder, Ptr); - Ptr = getScalarValue(Ptr, 0, 0); - } + Ptr = getScalarValue(Ptr, 0, 0); } else { // At this point we should vector version of GEP for Gather or Scatter assert(CreateGatherScatter && "The instruction should be scalarized"); - if (Gep) { - // Vectorizing GEP, across UF parts. We want to get a vector value for base - // and each index that's defined inside the loop, even if it is - // loop-invariant but wasn't hoisted out. Otherwise we want to keep them - // scalar. 
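With this change the code generator no longer re-derives how a memory access should be handled; it reads the cost model's per-VF widening decision and follows it. A small standalone sketch of that dispatch, assuming an illustrative enum in place of LoopVectorizationCostModel::InstWidening:

#include <iostream>

// Standalone sketch of the dispatch the patch moves into the cost model: one
// decision per load/store, taken up front. Types and strings are illustrative.
enum class InstWidening { Unknown, Widen, Interleave, GatherScatter, Scalarize };

const char *describe(InstWidening Decision, int ConsecutiveStride) {
  switch (Decision) {
  case InstWidening::Interleave:    return "wide load/store + shuffles";
  case InstWidening::Scalarize:     return "scalar copies (maybe predicated)";
  case InstWidening::GatherScatter: return "masked gather/scatter intrinsic";
  case InstWidening::Widen:
    return ConsecutiveStride < 0 ? "wide access + reverse" : "wide access";
  case InstWidening::Unknown:       return "error: decision must be known";
  }
  return "";
}

int main() {
  std::cout << describe(InstWidening::Widen, -1) << '\n';
  std::cout << describe(InstWidening::GatherScatter, 0) << '\n';
}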
- SmallVector<VectorParts, 4> OpsV; - for (Value *Op : Gep->operands()) { - Instruction *SrcInst = dyn_cast<Instruction>(Op); - if (SrcInst && OrigLoop->contains(SrcInst)) - OpsV.push_back(getVectorValue(Op)); - else - OpsV.push_back(VectorParts(UF, Op)); - } - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Ops; - Value *GEPBasePtr = OpsV[0][Part]; - for (unsigned i = 1; i < Gep->getNumOperands(); i++) - Ops.push_back(OpsV[i][Part]); - Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep"); - cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds()); - assert(NewGep->getType()->isVectorTy() && "Expected vector GEP"); - - NewGep = - Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF)); - VectorGep.push_back(NewGep); - } - } else - VectorGep = getVectorValue(Ptr); + VectorGep = getVectorValue(Ptr); } VectorParts Mask = createBlockInMask(Instr->getParent()); @@ -3027,7 +3088,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { @@ -3038,7 +3099,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Start if-block. Value *Cmp = nullptr; if (IfPredicateInstr) { - Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); + Cmp = Cond[Part]; + if (Cmp->getType()->isVectorTy()) + Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); } @@ -3346,7 +3409,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // - counts from zero, stepping by one // - is the size of the widest induction variable type // then we create a new one. - OldInduction = Legal->getInduction(); + OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. @@ -3543,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, namespace { struct CSEDenseMapInfo { - static bool canHandle(Instruction *I) { + static bool canHandle(const Instruction *I) { return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); } @@ -3553,12 +3616,12 @@ struct CSEDenseMapInfo { static inline Instruction *getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } - static unsigned getHashValue(Instruction *I) { + static unsigned getHashValue(const Instruction *I) { assert(canHandle(I) && "Unknown instruction!"); return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), I->value_op_end())); } - static bool isEqual(Instruction *LHS, Instruction *RHS) { + static bool isEqual(const Instruction *LHS, const Instruction *RHS) { if (LHS == getEmptyKey() || RHS == getEmptyKey() || LHS == getTombstoneKey() || RHS == getTombstoneKey()) return LHS == RHS; @@ -3589,51 +3652,6 @@ static void cse(BasicBlock *BB) { } } -/// \brief Adds a 'fast' flag to floating point operations. 
-static Value *addFastMathFlag(Value *V) { - if (isa<FPMathOperator>(V)) { - FastMathFlags Flags; - Flags.setUnsafeAlgebra(); - cast<Instruction>(V)->setFastMathFlags(Flags); - } - return V; -} - -/// \brief Estimate the overhead of scalarizing a value based on its type. -/// Insert and Extract are set if the result needs to be inserted and/or -/// extracted from vectors. -static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, - const TargetTransformInfo &TTI) { - if (Ty->isVoidTy()) - return 0; - - assert(Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; - - for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { - if (Extract) - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); - if (Insert) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); - } - - return Cost; -} - -/// \brief Estimate the overhead of scalarizing an Instruction based on the -/// types of its operands and return value. -static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys, - Type *RetTy, - const TargetTransformInfo &TTI) { - unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, TTI); - - for (Type *Ty : OpTys) - ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); - - return ScalarizationCost; -} - /// \brief Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, @@ -3641,14 +3659,24 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, if (VF == 1) return 0; + unsigned Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); + if (!RetTy->isVoidTy() && + (!isa<LoadInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore())) + Cost += TTI.getScalarizationOverhead(RetTy, true, false); - SmallVector<Type *, 4> OpTys; - unsigned OperandsNum = I->getNumOperands(); - for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) - OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + SmallVector<const Value *, 4> Operands(CI->arg_operands()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } + else if (!isa<StoreInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { + SmallVector<const Value *, 4> Operands(I->operand_values()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } - return getScalarizationOverhead(OpTys, RetTy, TTI); + return Cost; } // Estimate cost of a call instruction CI if it were vectorized with factor VF. @@ -3681,7 +3709,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. 
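The hunk above replaces the local type-based overhead helpers with the TTI hooks (getScalarizationOverhead / getOperandsScalarizationOverhead), but the underlying estimate is still one insert per produced lane plus one extract per consumed operand lane. A standalone sketch of that formula with made-up unit costs:

#include <iostream>

// Standalone sketch of the scalarization-overhead estimate. The unit costs
// below are illustrative; in LLVM they come from the target's TTI.
constexpr unsigned InsertCost = 1;   // illustrative cost of one insertelement
constexpr unsigned ExtractCost = 1;  // illustrative cost of one extractelement

unsigned scalarizationOverhead(unsigned VF, bool ProducesValue,
                               unsigned NumVectorOperands) {
  unsigned Cost = 0;
  if (ProducesValue)
    Cost += VF * InsertCost;                      // pack VF scalar results
  Cost += NumVectorOperands * VF * ExtractCost;   // unpack each vector operand
  return Cost;
}

int main() {
  // e.g. scalarizing a two-operand instruction at VF = 4: 4 inserts + 8 extracts.
  std::cout << scalarizationOverhead(4, /*ProducesValue=*/true, 2) << '\n'; // 12
}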
- unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3709,16 +3737,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - FastMathFlags FMF; if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) FMF = FPMO->getFastMathFlags(); - return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF); + SmallVector<Value *, 4> Operands(CI->arg_operands()); + return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { @@ -3861,30 +3885,27 @@ void InnerLoopVectorizer::vectorizeLoop() { // the cost-model. // //===------------------------------------------------===// - Constant *Zero = Builder.getInt32(0); - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. First, - // we create a new vector PHI node with no incoming edges. We use this value - // when we vectorize all of the instructions that use the PHI. Next, after - // all of the instructions in the block are complete we add the new incoming - // edges to the PHI. At this point all of the instructions in the basic block - // are vectorized, so we can use them to construct the PHI. - PhiVector PHIsToFix; - - // Collect instructions from the original loop that will become trivially - // dead in the vectorized loop. We don't need to vectorize these - // instructions. - collectTriviallyDeadInstructions(); + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - // Vectorize all of the blocks in the original loop. + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - vectorizeBlockInLoop(BB, &PHIsToFix); + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + vectorizeInstruction(I); // Insert truncates and extends for any truncated instructions as hints to // InstCombine. @@ -3892,224 +3913,10 @@ void InnerLoopVectorizer::vectorizeLoop() { truncateToMinimalBitwidths(); // At this point every instruction in the original loop is widened to a - // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI + // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. // This is the second stage of vectorizing recurrences. 
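The rewritten driver walks the blocks in reverse post-order so definitions are widened before their uses, and simply skips anything recorded as trivially dead. A toy standalone illustration of that filtering (the instruction strings are invented for the example):

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Standalone sketch: visit instructions def-before-use and skip the ones that
// become trivially dead (e.g. the old induction update and the old latch
// compare, which the new loop replaces with its own).
int main() {
  std::vector<std::string> ReversePostOrder = {
      "%i = phi", "%a = load", "%b = add %a", "store %b",
      "%i.next = add %i, 1", "%cond = icmp %i.next"};
  std::set<std::string> DeadInstructions = {"%i.next = add %i, 1",
                                            "%cond = icmp %i.next"};
  for (const std::string &I : ReversePostOrder)
    if (!DeadInstructions.count(I))
      std::cout << "widen: " << I << '\n';
}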
- for (PHINode *Phi : PHIsToFix) { - assert(Phi && "Unable to recover vectorized PHI"); - - // Handle first-order recurrences that need to be fixed. - if (Legal->isFirstOrderRecurrence(Phi)) { - fixFirstOrderRecurrence(Phi); - continue; - } - - // If the phi node is not a first-order recurrence, it must be a reduction. - // Get it's reduction variable descriptor. - assert(Legal->isReductionVariable(Phi) && - "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; - - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = - RdxDesc.getMinMaxRecurrenceKind(); - setDebugLocFromInst(Builder, ReductionStartValue); - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and override - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); - - // This is the vector-clone of the value that leaves the loop. - const VectorParts &VectorExit = getVectorValue(LoopExitInst); - Type *VecTy = VectorExit[0]->getType(); - - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Value *Identity; - Value *VectorStart; - if (RK == RecurrenceDescriptor::RK_IntegerMinMax || - RK == RecurrenceDescriptor::RK_FloatMinMax) { - // MinMax reduction have the start value as their identify. - if (VF == 1) { - VectorStart = Identity = ReductionStartValue; - } else { - VectorStart = Identity = - Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); - } - } else { - // Handle other reduction kinds: - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - if (VF == 1) { - Identity = Iden; - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = ReductionStartValue; - } else { - Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = - Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); - } - } - - // Fix the vector-loop phi. - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - const VectorParts &VecRdxPhi = getVectorValue(Phi); - BasicBlock *Latch = OrigLoop->getLoopLatch(); - Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - const VectorParts &Val = getVectorValue(LoopVal); - for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the - // first unroll part. - Value *StartVal = (part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(StartVal, LoopVectorPreHeader); - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(Val[part], LoopVectorBody); - } - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. 
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - - VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); - setDebugLocFromInst(Builder, LoopExitInst); - - // If the vector reduction can be performed in a smaller type, we truncate - // then extend the loop exit value to enable InstCombine to evaluate the - // entire expression in the smaller type. - if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint(LoopVectorBody->getTerminator()); - for (unsigned part = 0; part < UF; ++part) { - Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); - Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (Value::user_iterator UI = RdxParts[part]->user_begin(); - UI != RdxParts[part]->user_end();) - if (*UI != Trunc) { - (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); - RdxParts[part] = Extnd; - } else { - ++UI; - } - } - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - for (unsigned part = 0; part < UF; ++part) - RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); - } - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); - setDebugLocFromInst(Builder, ReducedPartRdx); - for (unsigned part = 1; part < UF; ++part) { - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], - ReducedPartRdx, "bin.rdx")); - else - ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); - } - - if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. - ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (Phi->getType() != RdxDesc.getRecurrenceType()) - ReducedPartRdx = - RdxDesc.isSigned() - ? 
Builder.CreateSExt(ReducedPartRdx, Phi->getType()) - : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); - } - - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); - LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) - break; - - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - - // We found our reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { - // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - break; - } - } // end of the LCSSA phi scan. - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); - } // end of for each Phi in PHIsToFix. + fixCrossIterationPHIs(); // Update the dominator tree. // @@ -4134,6 +3941,25 @@ void InnerLoopVectorizer::vectorizeLoop() { cse(LoopVectorBody); } +void InnerLoopVectorizer::fixCrossIterationPHIs() { + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #2: We now need to fix the recurrences by adding incoming edges to + // the currently empty PHI nodes. At this point every instruction in the + // original loop is widened to a vector form so we can use them to construct + // the incoming edges. + for (Instruction &I : *OrigLoop->getHeader()) { + PHINode *Phi = dyn_cast<PHINode>(&I); + if (!Phi) + break; + // Handle first-order recurrences and reductions that need to be fixed. + if (Legal->isFirstOrderRecurrence(Phi)) + fixFirstOrderRecurrence(Phi); + else if (Legal->isReductionVariable(Phi)) + fixReduction(Phi); + } +} + void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // This is the second phase of vectorizing first-order recurrences. An @@ -4212,15 +4038,17 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - // Get the vectorized previous value. We ensured the previous values was an - // instruction when detecting the recurrence. + // Get the vectorized previous value. auto &PreviousParts = getVectorValue(Previous); - // Set the insertion point to be after this instruction. 
We ensured the - // previous value dominated all uses of the phi when detecting the - // recurrence. - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); + // Set the insertion point after the previous value if it is an instruction. + // Note that the previous value may have been constant-folded so it is not + // guaranteed to be an instruction in the vector loop. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); + else + Builder.SetInsertPoint( + &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -4251,18 +4079,33 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. - auto *Extract = Incoming; + auto *ExtractForScalar = Incoming; if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), - "vector.recur.extract"); - } + ExtractForScalar = Builder.CreateExtractElement( + ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + } + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the LoopMiddleBlock, + // when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF > 1) + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of + // `Incoming`. This is analogous to the vectorized case above: extracting the + // second last element when VF > 1. + else if (UF > 1) + ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2]; // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); for (auto *BB : predecessors(LoopScalarPreHeader)) { - auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; + auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; Start->addIncoming(Incoming, BB); } @@ -4279,12 +4122,218 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (!LCSSAPhi) break; if (LCSSAPhi->getIncomingValue(0) == Phi) { - LCSSAPhi->addIncoming(Extract, LoopMiddleBlock); + LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); break; } } } +void InnerLoopVectorizer::fixReduction(PHINode *Phi) { + Constant *Zero = Builder.getInt32(0); + + // Get it's reduction variable descriptor. 
+ assert(Legal->isReductionVariable(Phi) && + "Unable to find the reduction variable"); + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; + + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = + RdxDesc.getMinMaxRecurrenceKind(); + setDebugLocFromInst(Builder, ReductionStartValue); + + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and override + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); + + // This is the vector-clone of the value that leaves the loop. + const VectorParts &VectorExit = getVectorValue(LoopExitInst); + Type *VecTy = VectorExit[0]->getType(); + + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. + Value *Identity; + Value *VectorStart; + if (RK == RecurrenceDescriptor::RK_IntegerMinMax || + RK == RecurrenceDescriptor::RK_FloatMinMax) { + // MinMax reduction have the start value as their identify. + if (VF == 1) { + VectorStart = Identity = ReductionStartValue; + } else { + VectorStart = Identity = + Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); + } + } else { + // Handle other reduction kinds: + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); + if (VF == 1) { + Identity = Iden; + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = ReductionStartValue; + } else { + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = + Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); + } + } + + // Fix the vector-loop phi. + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + const VectorParts &VecRdxPhi = getVectorValue(Phi); + BasicBlock *Latch = OrigLoop->getLoopLatch(); + Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + const VectorParts &Val = getVectorValue(LoopVal); + for (unsigned part = 0; part < UF; ++part) { + // Make sure to add the reduction stat value only to the + // first unroll part. + Value *StartVal = (part == 0) ? VectorStart : Identity; + cast<PHINode>(VecRdxPhi[part]) + ->addIncoming(StartVal, LoopVectorPreHeader); + cast<PHINode>(VecRdxPhi[part]) + ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + } + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + + VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); + setDebugLocFromInst(Builder, LoopExitInst); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. 
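The identity choice spelled out in the comments above (zero for add/or/xor, one for mul, -1 for and, and the start value itself for min/max) is easy to check in isolation. A standalone sketch, using an illustrative enum rather than RecurrenceDescriptor's recurrence kinds:

#include <cstdint>
#include <iostream>

// Standalone sketch: the vector PHI starts as a splat of the identity, and
// lane 0 of the first unroll part is overwritten with the scalar start value.
enum class RecurKind { Add, Mul, Or, And, Xor };

int32_t identityFor(RecurKind K) {
  switch (K) {
  case RecurKind::Add:
  case RecurKind::Or:
  case RecurKind::Xor:
    return 0;   // x + 0 == (x | 0) == (x ^ 0) == x
  case RecurKind::Mul:
    return 1;   // x * 1 == x
  case RecurKind::And:
    return -1;  // x & ~0 == x
  }
  return 0;
}

int main() {
  std::cout << identityFor(RecurKind::And) << '\n'; // -1
  std::cout << identityFor(RecurKind::Mul) << '\n'; // 1
}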
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + } + + // Reduce all of the unrolled parts into a single vector. + Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); + setDebugLocFromInst(Builder, ReducedPartRdx); + for (unsigned part = 1; part < UF; ++part) { + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + // Floating point operations had to be 'fast' to enable the reduction. + ReducedPartRdx = addFastMathFlag( + Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], + ReducedPartRdx, "bin.rdx")); + else + ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( + Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); + } + + if (VF > 1) { + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = Builder.getInt32(i / 2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), "rdx.shuf"); + + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp( + (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); + else + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, + TmpVec, Shuf); + } + + // The result is in the first element of the vector. + ReducedPartRdx = + Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (Phi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) + : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); + } + + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. 
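The final cross-lane reduction above halves the live width each round, so a VF-lane vector is reduced with log2(VF) shuffle-and-combine steps. A standalone scalar model of the same index pattern (add-reduction only, for illustration):

#include <cassert>
#include <iostream>
#include <vector>

// Standalone sketch of the log2(VF) shuffle reduction: each round moves the
// upper half of the vector onto the lower half and combines the two halves
// with the reduction operator; the result ends up in lane 0.
int reduceAdd(std::vector<int> Vec) {
  assert(!Vec.empty() && (Vec.size() & (Vec.size() - 1)) == 0 &&
         "emission only supported for power-of-two vectors");
  for (size_t Width = Vec.size(); Width != 1; Width >>= 1)
    for (size_t Lane = 0; Lane != Width / 2; ++Lane)
      Vec[Lane] += Vec[Width / 2 + Lane]; // "bin.rdx" of a lane and its shuffle
  return Vec[0];
}

int main() {
  std::cout << reduceAdd({1, 2, 3, 4, 5, 6, 7, 8}) << '\n'; // 36 in 3 rounds
}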
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); + BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); + LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) + break; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found a reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) + LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + } // end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (Instruction &LEI : *LoopExitBlock) { auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); @@ -4296,7 +4345,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { } } -void InnerLoopVectorizer::collectTriviallyDeadInstructions() { +void InnerLoopVectorizer::collectTriviallyDeadInstructions( + SmallPtrSetImpl<Instruction *> &DeadInstructions) { BasicBlock *Latch = OrigLoop->getLoopLatch(); // We create new control-flow for the vectorized loop, so the original @@ -4563,9 +4613,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF, PhiVector *PV) { + unsigned VF) { PHINode *P = cast<PHINode>(PN); - // Handle recurrences. + // In order to support recurrences we need to be able to vectorize Phi nodes. + // Phi nodes have cycles, so we need to vectorize them in two stages. This is + // stage #1: We create a new vector PHI node with no incoming edges. We'll use + // this value when we vectorize all of the instructions that use the PHI. if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { VectorParts Entry(UF); for (unsigned part = 0; part < UF; ++part) { @@ -4576,7 +4629,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); } VectorLoopValueMap.initVector(P, Entry); - PV->push_back(P); return; } @@ -4631,7 +4683,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: - return widenIntInduction(P); + case InductionDescriptor::IK_FpInduction: + return widenIntOrFpInduction(P); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. 
assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4641,7 +4694,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. ScalarParts Entry(UF); @@ -4658,30 +4711,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, VectorLoopValueMap.initScalar(P, Entry); return; } - case InductionDescriptor::IK_FpInduction: { - assert(P->getType() == II.getStartValue()->getType() && - "Types must match"); - // Handle other induction variables that are now based on the - // canonical one. - assert(P != OldInduction && "Primary induction can be integer only"); - - Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); - V = II.transform(Builder, V, PSE.getSE(), DL); - V->setName("fp.offset.idx"); - - // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal - - Value *Broadcasted = getBroadcastInstrs(V); - // After broadcasting the induction variable we need to make the vector - // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. - Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue(); - VectorParts Entry(UF); - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, - II.getInductionOpcode()); - VectorLoopValueMap.initVector(P, Entry); - return; - } } } @@ -4703,269 +4732,323 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { - // For each instruction in the old loop. - for (Instruction &I : *BB) { - - // If the instruction will become trivially dead when vectorized, we don't - // need to generate it. - if (DeadInstructions.count(&I)) - continue; +void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + return; + } - // Scalarize instructions that should remain scalar after vectorization. - if (VF > 1 && - !(isa<BranchInst>(&I) || isa<PHINode>(&I) || - isa<DbgInfoIntrinsic>(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - continue; - } + switch (I.getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + break; + case Instruction::PHI: { + // Vectorize PHINodes. + widenPHIInstruction(&I, UF, VF); + break; + } // End of PHI. + case Instruction::GetElementPtr: { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. 
+ auto *GEP = cast<GetElementPtrInst>(&I); + VectorParts Entry(UF); - switch (I.getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF, PV); - continue; - } // End of PHI. - - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - continue; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen binops. - auto *BinOp = cast<BinaryOperator>(&I); - setDebugLocFromInst(Builder, BinOp); - const VectorParts &A = getVectorValue(BinOp->getOperand(0)); - const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - - // Use this vector value for all users of the original instruction. - VectorParts Entry(UF); + if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateVectorSplat(VF, Clone); + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) - VecOp->copyIRFlags(BinOp); + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) + ? GEP->getPointerOperand() + : getVectorValue(GEP->getPointerOperand())[Part]; + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. 
+ SmallVector<Value *, 4> Indices; + for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { + if (OrigLoop->isLoopInvariant(U.get())) + Indices.push_back(U.get()); + else + Indices.push_back(getVectorValue(U.get())[Part]); + } - Entry[Part] = V; + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = GEP->isInBounds() + ? Builder.CreateInBoundsGEP(Ptr, Indices) + : Builder.CreateGEP(Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + Entry[Part] = NewGEP; } + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, BinOp); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, GEP); + break; + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); break; } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - auto *SE = PSE.getSE(); - bool InvariantCond = - SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, &I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - const VectorParts &Cond = getVectorValue(I.getOperand(0)); - const VectorParts &Op0 = getVectorValue(I.getOperand(1)); - const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - - auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + auto *BinOp = cast<BinaryOperator>(&I); + setDebugLocFromInst(Builder, BinOp); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); + + // Use this vector value for all users of the original instruction. + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + + if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) + VecOp->copyIRFlags(BinOp); + + Entry[Part] = V; + } - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = Builder.CreateSelect( - InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); - } + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, BinOp); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &I); + + // The condition can be loop invariant but still defined inside the + // loop. 
This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); + + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part] = Builder.CreateSelect( + InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); - setDebugLocFromInst(Builder, Cmp); - const VectorParts &A = getVectorValue(Cmp->getOperand(0)); - const VectorParts &B = getVectorValue(Cmp->getOperand(1)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = nullptr; - if (FCmp) { - C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - cast<FCmpInst>(C)->copyFastMathFlags(Cmp); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); - } - Entry[Part] = C; + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = dyn_cast<CmpInst>(&I); + setDebugLocFromInst(Builder, Cmp); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *C = nullptr; + if (FCmp) { + C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + cast<FCmpInst>(C)->copyFastMathFlags(Cmp); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); } + Entry[Part] = C; + } + + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(&I); + break; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = dyn_cast<CastInst>(&I); + setDebugLocFromInst(Builder, CI); + + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + if (Cost->isOptimizableIVTruncate(CI, VF)) { + widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)), + cast<TruncInst>(CI)); break; } - case Instruction::Store: - case Instruction::Load: - vectorizeMemoryInstruction(&I); + /// Vectorize casts. + Type *DestTy = + (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); + + const VectorParts &A = getVectorValue(CI->getOperand(0)); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + case Instruction::Call: { + // Ignore dbg intrinsics. + if (isa<DbgInfoIntrinsic>(I)) break; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); - setDebugLocFromInst(Builder, CI); - - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - auto ID = Legal->getInductionVars()->lookup(OldInduction); - if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction && - ID.getConstIntStepValue()) { - widenIntInduction(OldInduction, cast<TruncInst>(CI)); - break; - } + setDebugLocFromInst(Builder, &I); - /// Vectorize casts. - Type *DestTy = - (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast<CallInst>(&I); - const VectorParts &A = getVectorValue(CI->getOperand(0)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + StringRef FnName = CI->getCalledFunction()->getName(); + Function *F = CI->getCalledFunction(); + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { + scalarizeInstruction(&I); + break; + } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(&I); break; } - case Instruction::Call: { - // Ignore dbg intrinsics. 
- if (isa<DbgInfoIntrinsic>(I)) - break; - setDebugLocFromInst(Builder, &I); - - Module *M = BB->getParent()->getParent(); - auto *CI = cast<CallInst>(&I); - - StringRef FnName = CI->getCalledFunction()->getName(); - Function *F = CI->getCalledFunction(); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); - bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } - - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); - Arg = VectorArg[Part]; - } - Args.push_back(Arg); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + Arg = VectorArg[Part]; } + Args.push_back(Arg); + } - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the library call. - StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); - assert(!VFnName.empty() && "Vector function name is empty."); - VectorF = M->getFunction(VFnName); - if (!VectorF) { - // Generate a declaration - FunctionType *FTy = FunctionType::get(RetTy, Tys, false); - VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFnName, M); - VectorF->copyAttributesFrom(F); - } + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. 
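The call-widening choice made just above compares the vector-intrinsic cost against the vectorized library-call cost and falls back to scalarization when the cost model demands it. A standalone sketch of that decision with invented cost numbers:

#include <iostream>

// Standalone sketch: prefer a vector intrinsic when one exists and is no more
// expensive than the vectorized library call; otherwise use the library call
// unless the call has to be scalarized. All costs here are made up.
enum class CallStrategy { VectorIntrinsic, VectorLibCall, Scalarize };

CallStrategy chooseStrategy(bool HasIntrinsic, unsigned IntrinsicCost,
                            unsigned LibCallCost, bool NeedToScalarize) {
  bool UseVectorIntrinsic = HasIntrinsic && IntrinsicCost <= LibCallCost;
  if (UseVectorIntrinsic)
    return CallStrategy::VectorIntrinsic;
  if (NeedToScalarize)
    return CallStrategy::Scalarize;
  return CallStrategy::VectorLibCall;
}

int main() {
  // e.g. a call whose intrinsic form is cheaper than the vectorized lib call.
  std::cout << (chooseStrategy(true, 4, 10, false) ==
                CallStrategy::VectorIntrinsic) << '\n'; // 1
}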
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { + // Generate a declaration + FunctionType *FTy = FunctionType::get(RetTy, Tys, false); + VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFnName, M); + VectorF->copyAttributesFrom(F); } - assert(VectorF && "Can't create vector function."); - - SmallVector<OperandBundleDef, 1> OpBundles; - CI->getOperandBundlesAsDefs(OpBundles); - CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); + } + assert(VectorF && "Can't create vector function."); - if (isa<FPMathOperator>(V)) - V->copyFastMathFlags(CI); + SmallVector<OperandBundleDef, 1> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); - Entry[Part] = V; - } + if (isa<FPMathOperator>(V)) + V->copyFastMathFlags(CI); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); - break; + Entry[Part] = V; } - default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; - } // end of switch. - } // end of for_each instr. + VectorLoopValueMap.initVector(&I, Entry); + addMetadata(Entry, &I); + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(&I); + break; + } // end of switch. } void InnerLoopVectorizer::updateAnalysis() { @@ -4976,11 +5059,10 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - // We don't predicate stores by this point, so the vector body should be a - // single loop. - DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); - - DT->addNewBlock(LoopMiddleBlock, LoopVectorBody); + DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), + LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, + LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -5145,12 +5227,6 @@ bool LoopVectorizationLegality::canVectorize() { if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - // Collect all instructions that are known to be uniform after vectorization. - collectLoopUniforms(); - - // Collect all instructions that are known to be scalar after vectorization. - collectLoopScalars(); - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; @@ -5234,8 +5310,8 @@ void LoopVectorizationLegality::addInductionPhi( // one if there are multiple (no good reason for doing this other // than it is expedient). We've checked that it begins at zero and // steps by one, so this is a canonical induction variable. - if (!Induction || PhiTy == WidestIndTy) - Induction = Phi; + if (!PrimaryInduction || PhiTy == WidestIndTy) + PrimaryInduction = Phi; } // Both the PHI node itself, and the "post-increment" value feeding @@ -5398,7 +5474,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } // next instr. 
} - if (!Induction) { + if (!PrimaryInduction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { ORE->emit(createMissedAnalysis("NoInductionVariable") @@ -5410,46 +5486,166 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. - if (Induction && WidestIndTy != Induction->getType()) - Induction = nullptr; + if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; return true; } -void LoopVectorizationLegality::collectLoopScalars() { +void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { + + // We should not collect Scalars more than once per VF. Right now, this + // function is called from collectUniformsAndScalars(), which already does + // this check. Collecting Scalars for VF=1 does not make any sense. + assert(VF >= 2 && !Scalars.count(VF) && + "This function should not be visited twice for the same VF"); + + SmallSetVector<Instruction *, 8> Worklist; + + // These sets are used to seed the analysis with pointers used by memory + // accesses that will remain scalar. + SmallSetVector<Instruction *, 8> ScalarPtrs; + SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; + + // A helper that returns true if the use of Ptr by MemAccess will be scalar. + // The pointer operands of loads and stores will be scalar as long as the + // memory access is not a gather or scatter operation. The value operand of a + // store will remain scalar if the store is scalarized. + auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { + InstWidening WideningDecision = getWideningDecision(MemAccess, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + if (auto *Store = dyn_cast<StoreInst>(MemAccess)) + if (Ptr == Store->getValueOperand()) + return WideningDecision == CM_Scalarize; + assert(Ptr == getPointerOperand(MemAccess) && + "Ptr is neither a value or pointer operand"); + return WideningDecision != CM_GatherScatter; + }; + + // A helper that returns true if the given value is a bitcast or + // getelementptr instruction contained in the loop. + auto isLoopVaryingBitCastOrGEP = [&](Value *V) { + return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || + isa<GetElementPtrInst>(V)) && + !TheLoop->isLoopInvariant(V); + }; - // If an instruction is uniform after vectorization, it will remain scalar. - Scalars.insert(Uniforms.begin(), Uniforms.end()); + // A helper that evaluates a memory access's use of a pointer. If the use + // will be a scalar use, and the pointer is only used by memory accesses, we + // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. + auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - // Collect the getelementptr instructions that will not be vectorized. A - // getelementptr instruction is only vectorized if it is used for a legal - // gather or scatter operation. + // We only care about bitcast and getelementptr instructions contained in + // the loop. + if (!isLoopVaryingBitCastOrGEP(Ptr)) + return; + + // If the pointer has already been identified as scalar (e.g., if it was + // also identified as uniform), there's nothing to do. 
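The cost model now keys its uniform and scalar sets by vectorization factor and refuses to recompute them, as the asserts above spell out. A sketch of that per-VF memoization; PerVFSets and the string-based sets are placeholders for the real Instruction-keyed containers:

    #include <map>
    #include <set>
    #include <string>

    struct PerVFSets {
      // Keyed by VF; the presence of an entry means the analysis already ran.
      std::map<unsigned, std::set<std::string>> Uniforms;
      std::map<unsigned, std::set<std::string>> Scalars;

      void collectUniformsAndScalars(unsigned VF) {
        // VF == 1 needs no analysis; every other VF is analyzed at most once.
        if (VF == 1 || Uniforms.count(VF))
          return;
        // Widening decisions for VF must already be recorded at this point
        // (see the asserts above), since both collections consult them.
        collectLoopUniforms(VF);   // fills Uniforms[VF]
        collectLoopScalars(VF);    // fills Scalars[VF]; seeds from Uniforms[VF]
      }

    private:
      void collectLoopUniforms(unsigned VF) { Uniforms[VF]; /* analysis */ }
      void collectLoopScalars(unsigned VF) { Scalars[VF]; /* analysis */ }
    };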
+ auto *I = cast<Instruction>(Ptr); + if (Worklist.count(I)) + return; + + // If the use of the pointer will be a scalar use, and all users of the + // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, + // place the pointer in PossibleNonScalarPtrs. + if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) { + return isa<LoadInst>(U) || isa<StoreInst>(U); + })) + ScalarPtrs.insert(I); + else + PossibleNonScalarPtrs.insert(I); + }; + + // We seed the scalars analysis with three classes of instructions: (1) + // instructions marked uniform-after-vectorization, (2) bitcast and + // getelementptr instructions used by memory accesses requiring a scalar use, + // and (3) pointer induction variables and their update instructions (we + // currently only scalarize these). + // + // (1) Add to the worklist all instructions that have been identified as + // uniform-after-vectorization. + Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); + + // (2) Add to the worklist all bitcast and getelementptr instructions used by + // memory accesses requiring a scalar use. The pointer operands of loads and + // stores will be scalar as long as the memory accesses is not a gather or + // scatter operation. The value operand of a store will remain scalar if the + // store is scalarized. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - Scalars.insert(GEP); - continue; + if (auto *Load = dyn_cast<LoadInst>(&I)) { + evaluatePtrUse(Load, Load->getPointerOperand()); + } else if (auto *Store = dyn_cast<StoreInst>(&I)) { + evaluatePtrUse(Store, Store->getPointerOperand()); + evaluatePtrUse(Store, Store->getValueOperand()); } - auto *Ptr = getPointerOperand(&I); - if (!Ptr) - continue; - auto *GEP = getGEPInstruction(Ptr); - if (GEP && isLegalGatherOrScatter(&I)) - Scalars.erase(GEP); + } + for (auto *I : ScalarPtrs) + if (!PossibleNonScalarPtrs.count(I)) { + DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); + Worklist.insert(I); } + // (3) Add to the worklist all pointer induction variables and their update + // instructions. + // + // TODO: Once we are able to vectorize pointer induction variables we should + // no longer insert them into the worklist here. + auto *Latch = TheLoop->getLoopLatch(); + for (auto &Induction : *Legal->getInductionVars()) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) + continue; + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + } + + // Expand the worklist by looking through any bitcasts and getelementptr + // instructions we've already identified as scalar. This is similar to the + // expansion step in collectLoopUniforms(); however, here we're only + // expanding to include additional bitcasts and getelementptr instructions. 
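The seeding step above hinges on one observation: a consecutive (widened or interleaved) access only needs the address of lane zero, so its pointer can stay scalar, while a gather or scatter needs a full vector of addresses. A simplified version of that classification; the additional requirement that every user of the pointer is a load or store is elided, and all names below are illustrative:

    #include <set>
    #include <string>

    enum class Widening { Widen, Interleave, GatherScatter, Scalarize };

    // Mirrors the spirit of evaluatePtrUse()/isScalarUse() above: classify a
    // pointer operand using the widening decision already made for its access.
    void classifyPtr(const std::string &Ptr, Widening Decision,
                     std::set<std::string> &ScalarPtrs,
                     std::set<std::string> &PossibleNonScalarPtrs) {
      if (Decision != Widening::GatherScatter)
        ScalarPtrs.insert(Ptr);             // only lane 0's address is needed
      else
        PossibleNonScalarPtrs.insert(Ptr);  // per-lane addresses stay vectorized
    }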
+ unsigned Idx = 0; + while (Idx != Worklist.size()) { + Instruction *Dst = Worklist[Idx++]; + if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) + continue; + auto *Src = cast<Instruction>(Dst->getOperand(0)); + if (all_of(Src->users(), [&](User *U) -> bool { + auto *J = cast<Instruction>(U); + return !TheLoop->contains(J) || Worklist.count(J) || + ((isa<LoadInst>(J) || isa<StoreInst>(J)) && + isScalarUse(J, Src)); + })) { + Worklist.insert(Src); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); + } + } + // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. - auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *getInductionVars()) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + // We already considered pointer induction variables, so there's no reason + // to look at their users again. + // + // TODO: Once we are able to vectorize pointer induction variables we + // should no longer skip over them here. + if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) + continue; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); }); if (!ScalarInd) continue; @@ -5458,23 +5654,19 @@ void LoopVectorizationLegality::collectLoopScalars() { // scalar after vectorization. auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I); }); if (!ScalarIndUpdate) continue; // The induction variable and its update instruction will remain scalar. - Scalars.insert(Ind); - Scalars.insert(IndUpdate); + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } -} -bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { - if (isAccessInterleaved(I)) - return true; - if (auto *Ptr = getPointerOperand(I)) - return isConsecutivePtr(Ptr); - return false; + Scalars[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { @@ -5494,48 +5686,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { return false; } -bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( - Instruction *I, unsigned VF) { - - // If the memory instruction is in an interleaved group, it will be - // vectorized and its pointer will remain uniform. - if (isAccessInterleaved(I)) - return false; - +bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); assert((LI || SI) && "Invalid memory instruction"); - // If the pointer operand is uniform (loop invariant), the memory instruction - // will be scalarized. 
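The induction handling above keeps an induction PHI and its update instruction scalar only when every in-loop user of either one is already in the scalar worklist (the other member of the pair also counts). The test reduces to two all-of checks over users; a toy version with strings standing in for instructions:

    #include <algorithm>
    #include <set>
    #include <string>
    #include <vector>

    bool inductionStaysScalar(const std::string &Ind, const std::string &IndUpdate,
                              const std::vector<std::string> &IndUsers,
                              const std::vector<std::string> &UpdateUsers,
                              const std::set<std::string> &KnownScalar) {
      auto AllScalar = [&](const std::vector<std::string> &Users,
                           const std::string &Other) {
        return std::all_of(Users.begin(), Users.end(), [&](const std::string &U) {
          // A user is fine if it is the other half of the pair or already scalar.
          return U == Other || KnownScalar.count(U) > 0;
        });
      };
      return AllScalar(IndUsers, IndUpdate) && AllScalar(UpdateUsers, Ind);
    }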
auto *Ptr = getPointerOperand(I); - if (LI && isUniform(Ptr)) - return true; - // If the pointer operand is non-consecutive and neither a gather nor a - // scatter operation is legal, the memory instruction will be scalarized. - if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I)) - return true; + // In order to be widened, the pointer should be consecutive, first of all. + if (!isConsecutivePtr(Ptr)) + return false; // If the instruction is a store located in a predicated block, it will be // scalarized. if (isScalarWithPredication(I)) - return true; + return false; // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); if (hasIrregularType(ScalarTy, DL, VF)) - return true; + return false; - // Otherwise, the memory instruction should be vectorized if the rest of the - // loop is. - return false; + return true; } -void LoopVectorizationLegality::collectLoopUniforms() { +void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { + + // We should not collect Uniforms more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Uniforms for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Uniforms.count(VF) && + "This function should not be visited twice for the same VF"); + + // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // not analyze again. Uniforms.count(VF) will return 1. + Uniforms[VF].clear(); + // We now know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. @@ -5568,6 +5760,14 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; + auto isUniformDecision = [&](Instruction *I, unsigned VF) { + InstWidening WideningDecision = getWideningDecision(I, VF); + assert(WideningDecision != CM_Unknown && + "Widening decision should be ready at this moment"); + + return (WideningDecision == CM_Widen || + WideningDecision == CM_Interleave); + }; // Iterate over the instructions in the loop, and collect all // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible // that a consecutive-like pointer operand will be scalarized, we collect it @@ -5590,25 +5790,18 @@ void LoopVectorizationLegality::collectLoopUniforms() { return getPointerOperand(U) == Ptr; }); - // Ensure the memory instruction will not be scalarized, making its - // pointer operand non-uniform. If the pointer operand is used by some - // instruction other than a memory access, we're not going to check if - // that other instruction may be scalarized here. Thus, conservatively - // assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + // Ensure the memory instruction will not be scalarized or used by + // gather/scatter, making its pointer operand non-uniform. If the pointer + // operand is used by any instruction other than a memory access, we + // conservatively assume the pointer operand may be non-uniform. + if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) PossibleNonUniformPtrs.insert(Ptr); // If the memory instruction will be vectorized and its pointer operand - // is consecutive-like, the pointer operand should remain uniform. 
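After this rewrite, memoryInstructionCanBeWidened() is a pure legality test, and profitability is decided later by the cost model. Its three conditions boil down to the following predicate; MemAccessDesc and its fields are illustrative, not LLVM types:

    struct MemAccessDesc {
      bool ConsecutivePtr;          // stride of +1 or -1
      bool ScalarWithPredication;   // e.g. a store in a predicated block
      bool IrregularType;           // allocated size != type size (needs padding)
    };

    // Widening to a plain wide load/store is possible only for consecutive,
    // unpredicated accesses of regularly sized types.
    bool canBeWidened(const MemAccessDesc &M) {
      return M.ConsecutivePtr && !M.ScalarWithPredication && !M.IrregularType;
    }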
- else if (hasConsecutiveLikePtrOperand(&I)) - ConsecutiveLikePtrs.insert(Ptr); - - // Otherwise, if the memory instruction will be vectorized and its - // pointer operand is non-consecutive-like, the memory instruction should - // be a gather or scatter operation. Its pointer operand will be - // non-uniform. + // is consecutive-like, or interleaving - the pointer operand should + // remain uniform. else - PossibleNonUniformPtrs.insert(Ptr); + ConsecutiveLikePtrs.insert(Ptr); } // Add to the Worklist all consecutive and consecutive-like pointers that @@ -5632,7 +5825,9 @@ void LoopVectorizationLegality::collectLoopUniforms() { continue; auto *OI = cast<Instruction>(OV); if (all_of(OI->users(), [&](User *U) -> bool { - return isOutOfScope(U) || Worklist.count(cast<Instruction>(U)); + auto *J = cast<Instruction>(U); + return !TheLoop->contains(J) || Worklist.count(J) || + (OI == getPointerOperand(J) && isUniformDecision(J, VF)); })) { Worklist.insert(OI); DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); @@ -5643,7 +5838,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + return getPointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // For an instruction to be added into Worklist above, all its users inside @@ -5652,7 +5847,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : Inductions) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -5683,7 +5878,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } - Uniforms.insert(Worklist.begin(), Worklist.end()); + Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -5823,7 +6018,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses( uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); // An alignment of 0 means target ABI alignment. - unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned Align = getMemInstAlignment(&I); if (!Align) Align = DL.getABITypeAlignment(PtrTy->getElementType()); @@ -5978,6 +6173,11 @@ void InterleavedAccessInfo::analyzeInterleaving( if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size) continue; + // Ignore A if the memory object of A and B don't belong to the same + // address space + if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B)) + continue; + // Calculate the distance from A to B. const SCEVConstant *DistToB = dyn_cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev)); @@ -6020,36 +6220,36 @@ void InterleavedAccessInfo::analyzeInterleaving( if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); - // Remove interleaved groups with gaps (currently only loads) whose memory - // accesses may wrap around. 
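In the uniforms collection above, whether a pointer stays uniform now follows directly from the recorded widening decision: only widened or interleaved accesses keep their pointer operand uniform across lanes, and a pointer with any non-memory user is treated conservatively. A compact restatement; names are illustrative:

    #include <set>
    #include <string>

    enum class WideningDecision { Widen, Interleave, GatherScatter, Scalarize };

    void classifyPtrForUniformity(const std::string &Ptr, bool UsersAreMemAccesses,
                                  WideningDecision D,
                                  std::set<std::string> &ConsecutiveLikePtrs,
                                  std::set<std::string> &PossibleNonUniformPtrs) {
      bool UniformDecision =
          D == WideningDecision::Widen || D == WideningDecision::Interleave;
      if (UsersAreMemAccesses && UniformDecision)
        ConsecutiveLikePtrs.insert(Ptr);     // candidate to remain uniform
      else
        PossibleNonUniformPtrs.insert(Ptr);  // scalarized or gather/scatter use
    }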
We have to revisit the getPtrStride analysis, - // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does + // Remove interleaved groups with gaps (currently only loads) whose memory + // accesses may wrap around. We have to revisit the getPtrStride analysis, + // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does // not check wrapping (see documentation there). - // FORNOW we use Assume=false; - // TODO: Change to Assume=true but making sure we don't exceed the threshold + // FORNOW we use Assume=false; + // TODO: Change to Assume=true but making sure we don't exceed the threshold // of runtime SCEV assumptions checks (thereby potentially failing to - // vectorize altogether). + // vectorize altogether). // Additional optional optimizations: - // TODO: If we are peeling the loop and we know that the first pointer doesn't + // TODO: If we are peeling the loop and we know that the first pointer doesn't // wrap then we can deduce that all pointers in the group don't wrap. - // This means that we can forcefully peel the loop in order to only have to - // check the first pointer for no-wrap. When we'll change to use Assume=true + // This means that we can forcefully peel the loop in order to only have to + // check the first pointer for no-wrap. When we'll change to use Assume=true // we'll only need at most one runtime check per interleaved group. // for (InterleaveGroup *Group : LoadGroups) { // Case 1: A full group. Can Skip the checks; For full groups, if the wide - // load would wrap around the address space we would do a memory access at - // nullptr even without the transformation. - if (Group->getNumMembers() == Group->getFactor()) + // load would wrap around the address space we would do a memory access at + // nullptr even without the transformation. + if (Group->getNumMembers() == Group->getFactor()) continue; - // Case 2: If first and last members of the group don't wrap this implies + // Case 2: If first and last members of the group don't wrap this implies // that all the pointers in the group don't wrap. // So we check only group member 0 (which is always guaranteed to exist), - // and group member Factor - 1; If the latter doesn't exist we rely on + // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reveresed accsess -- see Case 3). Value *FirstMemberPtr = getPointerOperand(Group->getMember(0)); - if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, + if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " "first group member potentially pointer-wrapping.\n"); @@ -6065,8 +6265,7 @@ void InterleavedAccessInfo::analyzeInterleaving( "last group member potentially pointer-wrapping.\n"); releaseGroup(Group); } - } - else { + } else { // Case 3: A non-reversed interleaved load group with gaps: We need // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. 
We only need @@ -6082,27 +6281,62 @@ void InterleavedAccessInfo::analyzeInterleaving( } } -LoopVectorizationCostModel::VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { - // Width 1 means no vectorize - VectorizationFactor Factor = {1U, 0U}; - if (OptForSize && Legal->getRuntimePointerChecking()->Need) { +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + ORE->emit(createMissedAnalysis("ConditionalStore") + << "store that is conditionally executed prevents vectorization"); + DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); + return None; + } + + if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. + return computeFeasibleMaxVF(OptForSize); + + if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return Factor; + return None; } - if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); - return Factor; + // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + + // If we don't know the precise trip count, don't try to vectorize. + if (TC < 2) { + ORE->emit( + createMissedAnalysis("UnknownLoopCountComplexCFG") + << "unable to calculate the loop count due to complex control flow"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + return None; } + unsigned MaxVF = computeFeasibleMaxVF(OptForSize); + + if (TC % MaxVF != 0) { + // If the trip count that we found modulo the vectorization factor is not + // zero then we require a tail. + // FIXME: look for a smaller MaxVF that does divide TC rather than give up. + // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a + // smaller MaxVF that does not require a scalar epilog. + + ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") + << "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + return None; + } + + return MaxVF; +} + +unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -6136,7 +6370,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); - unsigned VF = MaxVectorSize; + unsigned MaxVF = MaxVectorSize; if (MaximizeBandwidth && !OptForSize) { // Collect all viable vectorization factors. 
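computeMaxVF() concentrates the -Os/-Oz policy in one place: without a scalar tail, a known trip count has to divide evenly by the feasible maximum VF, and an unknown trip count rules vectorization out (the in-source FIXMEs note that searching for a smaller dividing VF would be a refinement). A sketch of that gate; the feasible maximum is passed in here instead of being derived from register widths, and the early bail-outs for conditional stores and runtime pointer checks are omitted:

    #include <optional>

    std::optional<unsigned> computeMaxVFSketch(bool OptForSize, unsigned TripCount,
                                               unsigned FeasibleMaxVF) {
      if (!OptForSize)
        return FeasibleMaxVF;            // remaining checks only matter for -Os/-Oz
      if (TripCount < 2)
        return std::nullopt;             // unknown or trivial trip count: give up
      if (TripCount % FeasibleMaxVF != 0)
        return std::nullopt;             // would require a scalar tail loop
      return FeasibleMaxVF;
    }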
SmallVector<unsigned, 8> VFs; @@ -6152,54 +6386,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { - VF = VFs[i]; + MaxVF = VFs[i]; break; } } } + return MaxVF; +} - // If we optimize the program for size, avoid creating the tail loop. - if (OptForSize) { - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - - // If we don't know the precise trip count, don't try to vectorize. - if (TC < 2) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); - return Factor; - } - - // Find the maximum SIMD width that can fit within the trip count. - VF = TC % MaxVectorSize; - - if (VF == 0) - VF = MaxVectorSize; - else { - // If the trip count that we found modulo the vectorization factor is not - // zero then we require a tail. - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os/-Oz"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); - return Factor; - } - } - - int UserVF = Hints->getWidth(); - if (UserVF != 0) { - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); - DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - - Factor.Width = UserVF; - collectInstsToScalarize(UserVF); - return Factor; - } - +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { float Cost = expectedCost(1).first; #ifndef NDEBUG const float ScalarCost = Cost; @@ -6209,12 +6405,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; // Ignore scalar width, because the user explicitly wants vectorization. - if (ForceVectorization && VF > 1) { + if (ForceVectorization && MaxVF > 1) { Width = 2; Cost = expectedCost(Width).first / (float)Width; } - for (unsigned i = 2; i <= VF; i *= 2) { + for (unsigned i = 2; i <= MaxVF; i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6238,8 +6434,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - Factor.Width = Width; - Factor.Cost = Width * Cost; + VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; return Factor; } @@ -6277,9 +6472,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { T = ST->getValueOperand()->getType(); // Ignore loaded pointer types and stored pointer types that are not - // consecutive. However, we do want to take consecutive stores/loads of - // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I)) + // vectorizable. + // + // FIXME: The check here attempts to predict whether a load or store will + // be vectorized. We only know this for certain after a VF has + // been selected. 
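With the maximum VF computed separately, selectVectorizationFactor() reduces to picking the power-of-two width with the lowest expected cost per lane, starting the search at width 2 when vectorization is forced. A compact sketch with a caller-supplied cost function standing in for expectedCost():

    #include <functional>
    #include <utility>

    // Returns {Width, Cost}. CostOf(VF) plays the role of expectedCost(VF).first.
    std::pair<unsigned, unsigned>
    selectVF(unsigned MaxVF, bool ForceVectorization,
             const std::function<unsigned(unsigned)> &CostOf) {
      float Cost = static_cast<float>(CostOf(1));   // scalar baseline
      unsigned Width = 1;
      if (ForceVectorization && MaxVF > 1) {
        Width = 2;                                  // ignore the scalar baseline
        Cost = CostOf(Width) / static_cast<float>(Width);
      }
      for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
        // The vector loop runs fewer iterations, so compare cost per scalar lane.
        float VectorCost = CostOf(VF) / static_cast<float>(VF);
        if (VectorCost < Cost) {
          Cost = VectorCost;
          Width = VF;
        }
      }
      return {Width, static_cast<unsigned>(Width * Cost)};
    }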
Here, we assume that if an access can be + // vectorized, it will be. We should also look at extending this + // optimization to non-pointer types. + // + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && + !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I)) continue; MinWidth = std::min(MinWidth, @@ -6562,12 +6764,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); continue; } - + collectUniformsAndScalars(VFs[j]); // Count the number of live intervals. unsigned RegUsage = 0; for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst)) + if (VecValuesToIgnore.count(Inst) || + isScalarAfterVectorization(Inst, VFs[j])) continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); } @@ -6628,6 +6831,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { ScalarCostsTy ScalarCosts; if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + + // Remember that BB will remain after vectorization. + PredicatedBBsAfterVectorization.insert(BB); } } } @@ -6636,7 +6842,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, unsigned VF) { - assert(!Legal->isUniformAfterVectorization(PredInst) && + assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6657,7 +6863,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - Legal->isScalarAfterVectorization(I)) + isScalarAfterVectorization(I, VF)) return false; // If the instruction is scalar with predication, it will be analyzed @@ -6677,7 +6883,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the lane zero values for uniforms rather than asserting. for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) - if (Legal->isUniformAfterVectorization(J)) + if (isUniformAfterVectorization(J, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6690,7 +6896,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // and their return values are inserted into vectors. Thus, an extract would // still be required. auto needsExtract = [&](Instruction *I) -> bool { - return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF); }; // Compute the expected cost discount from scalarizing the entire expression @@ -6717,8 +6923,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, - false, TTI); + ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), + true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); } @@ -6733,8 +6939,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J)) - ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF), - false, true, TTI); + ScalarCost += TTI.getScalarizationOverhead( + ToVectorTy(J->getType(),VF), false, true); } // Scale the total scalar cost by block probability. @@ -6753,6 +6959,9 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. + collectUniformsAndScalars(VF); + // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. collectInstsToScalarize(VF); @@ -6832,11 +7041,141 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { Legal->hasStride(I->getOperand(1)); } +unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + auto SE = PSE.getSE(); + + unsigned Alignment = getMemInstAlignment(I); + unsigned AS = getMemInstAddressSpace(I); + Value *Ptr = getPointerOperand(I); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + + // Get the cost of the scalar memory instruction and address computation. + unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + + Cost += VF * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, I); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. 
+ if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + + return Cost; +} + +unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + unsigned AS = getMemInstAddressSpace(I); + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + + assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && + "Stride should be 1 or -1 for consecutive memory access"); + unsigned Cost = 0; + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + + bool Reverse = ConsecutiveStride < 0; + if (Reverse) + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + +unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, + unsigned VF) { + LoadInst *LI = cast<LoadInst>(I); + Type *ValTy = LI->getType(); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = LI->getAlignment(); + unsigned AS = LI->getPointerAddressSpace(); + + return TTI.getAddressComputationCost(ValTy) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); +} + +unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = getMemInstAlignment(I); + Value *Ptr = getPointerOperand(I); + + return TTI.getAddressComputationCost(VectorTy) + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); +} + +unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + unsigned VF) { + Type *ValTy = getMemInstValueType(I); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned AS = getMemInstAddressSpace(I); + + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + unsigned InterleaveFactor = Group->getFactor(); + Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + + // Holds the indices of existing members in an interleaved load group. + // An interleaved store group doesn't need this as it doesn't allow gaps. + SmallVector<unsigned, 4> Indices; + if (isa<LoadInst>(I)) { + for (unsigned i = 0; i < InterleaveFactor; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getFactor(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) + Cost += Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + +unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, + unsigned VF) { + + // Calculate scalar cost only. Vectorization cost should be ready at this + // moment. 
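The scalarization path above prices a memory access as VF scalar address computations plus VF scalar loads or stores, plus the insertelement/extractelement traffic needed to shuttle values between vector and scalar form, and finally scales the total down when the access only runs in a predicated block. As plain arithmetic, with every per-operation value passed in (the real numbers come from TTI hooks and getReciprocalPredBlockProb()):

    unsigned scalarizedMemCost(unsigned VF, unsigned AddrCost, unsigned MemOpCost,
                               unsigned InsExtOverhead, bool Predicated,
                               unsigned ReciprocalBlockProb) {
      unsigned Cost = VF * AddrCost + VF * MemOpCost + InsExtOverhead;
      if (Predicated)                 // only some lanes actually execute the access
        Cost /= ReciprocalBlockProb;
      return Cost;
    }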
+ if (VF == 1) { + Type *ValTy = getMemInstValueType(I); + unsigned Alignment = getMemInstAlignment(I); + unsigned AS = getMemInstAlignment(I); + + return TTI.getAddressComputationCost(ValTy) + + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + } + return getWideningCost(I, VF); +} + LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. - if (Legal->isUniformAfterVectorization(I)) + if (isUniformAfterVectorization(I, VF)) VF = 1; if (VF > 1 && isProfitableToScalarize(I, VF)) @@ -6850,6 +7189,79 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return VectorizationCostTy(C, TypeNotScalarized); } +void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { + if (VF == 1) + return; + for (BasicBlock *BB : TheLoop->blocks()) { + // For each instruction in the old loop. + for (Instruction &I : *BB) { + Value *Ptr = getPointerOperand(&I); + if (!Ptr) + continue; + + if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) { + // Scalar load + broadcast + unsigned Cost = getUniformMemOpCost(&I, VF); + setWideningDecision(&I, VF, CM_Scalarize, Cost); + continue; + } + + // We assume that widening is the best solution when possible. + if (Legal->memoryInstructionCanBeWidened(&I, VF)) { + unsigned Cost = getConsecutiveMemOpCost(&I, VF); + setWideningDecision(&I, VF, CM_Widen, Cost); + continue; + } + + // Choose between Interleaving, Gather/Scatter or Scalarization. + unsigned InterleaveCost = UINT_MAX; + unsigned NumAccesses = 1; + if (Legal->isAccessInterleaved(&I)) { + auto Group = Legal->getInterleavedAccessGroup(&I); + assert(Group && "Fail to get an interleaved access group."); + + // Make one decision for the whole group. + if (getWideningDecision(&I, VF) != CM_Unknown) + continue; + + NumAccesses = Group->getNumMembers(); + InterleaveCost = getInterleaveGroupCost(&I, VF); + } + + unsigned GatherScatterCost = + Legal->isLegalGatherOrScatter(&I) + ? getGatherScatterCost(&I, VF) * NumAccesses + : UINT_MAX; + + unsigned ScalarizationCost = + getMemInstScalarizationCost(&I, VF) * NumAccesses; + + // Choose better solution for the current VF, + // write down this decision and use it during vectorization. + unsigned Cost; + InstWidening Decision; + if (InterleaveCost <= GatherScatterCost && + InterleaveCost < ScalarizationCost) { + Decision = CM_Interleave; + Cost = InterleaveCost; + } else if (GatherScatterCost < ScalarizationCost) { + Decision = CM_GatherScatter; + Cost = GatherScatterCost; + } else { + Decision = CM_Scalarize; + Cost = ScalarizationCost; + } + // If the instructions belongs to an interleave group, the whole group + // receives the same decision. The whole group receives the cost, but + // the cost will actually be assigned to one instruction. + if (auto Group = Legal->getInterleavedAccessGroup(&I)) + setWideningDecision(Group, VF, Decision, Cost); + else + setWideningDecision(&I, VF, Decision, Cost); + } + } +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy) { @@ -6868,7 +7280,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // instruction cost. return 0; case Instruction::Br: { - return TTI.getCFInstrCost(I->getOpcode()); + // In cases of scalarized and predicated instructions, there will be VF + // predicated blocks in the vectorized loop. 
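setCostBasedWideningDecision() then resolves each remaining access (or whole interleave group) by taking the cheapest of the interleave, gather/scatter, and scalarization costs; plain widening was already chosen earlier whenever it is legal. The tie-breaking above, as a small standalone helper using the same UINT_MAX-means-unavailable convention:

    #include <utility>

    enum class Decision { Interleave, GatherScatter, Scalarize };

    std::pair<Decision, unsigned>
    pickMemDecision(unsigned InterleaveCost, unsigned GatherScatterCost,
                    unsigned ScalarizationCost) {
      if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
        return {Decision::Interleave, InterleaveCost};
      if (GatherScatterCost < ScalarizationCost)
        return {Decision::GatherScatter, GatherScatterCost};
      return {Decision::Scalarize, ScalarizationCost};
    }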
Each branch around these + // blocks requires also an extract of its vector compare i1 element. + bool ScalarPredicatedBB = false; + BranchInst *BI = cast<BranchInst>(I); + if (VF > 1 && BI->isConditional() && + (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + ScalarPredicatedBB = true; + + if (ScalarPredicatedBB) { + // Return cost for branches around scalarized and predicated blocks. + Type *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + + (TTI.getCFInstrCost(Instruction::Br) * VF)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + // The back-edge branch will remain, as will all scalar branches. + return TTI.getCFInstrCost(Instruction::Br); + else + // This branch will be eliminated by if-conversion. + return 0; + // Note: We currently assume zero cost for an unconditional branch inside + // a predicated block since it will become a fall-through, although we + // may decide in the future to call TTI for all branches. } case Instruction::PHI: { auto *Phi = cast<PHINode>(I); @@ -6969,7 +7405,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -6978,130 +7414,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); } case Instruction::Store: case Instruction::Load: { - StoreInst *SI = dyn_cast<StoreInst>(I); - LoadInst *LI = dyn_cast<LoadInst>(I); - Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); - VectorTy = ToVectorTy(ValTy, VF); - - unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); - unsigned AS = - SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); - Value *Ptr = getPointerOperand(I); - // We add the cost of address computation here instead of with the gep - // instruction because only here we know whether the operation is - // scalarized. - if (VF == 1) - return TTI.getAddressComputationCost(VectorTy) + - TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - - if (LI && Legal->isUniform(Ptr)) { - // Scalar load + broadcast - unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType()); - Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - return Cost + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy); - } - - // For an interleaved access, calculate the total cost of the whole - // interleave group. - if (Legal->isAccessInterleaved(I)) { - auto Group = Legal->getInterleavedAccessGroup(I); - assert(Group && "Fail to get an interleaved access group."); - - // Only calculate the cost once at the insert position. 
- if (Group->getInsertPos() != I) - return 0; - - unsigned InterleaveFactor = Group->getFactor(); - Type *WideVecTy = - VectorType::get(VectorTy->getVectorElementType(), - VectorTy->getVectorNumElements() * InterleaveFactor); - - // Holds the indices of existing members in an interleaved load group. - // An interleaved store group doesn't need this as it doesn't allow gaps. - SmallVector<unsigned, 4> Indices; - if (LI) { - for (unsigned i = 0; i < InterleaveFactor; i++) - if (Group->getMember(i)) - Indices.push_back(i); - } - - // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) - Cost += - Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - - // FIXME: The interleaved load group with a huge gap could be even more - // expensive than scalar operations. Then we could ignore such group and - // use scalar operations instead. - return Cost; - } - - // Check if the memory instruction will be scalarized. - if (Legal->memoryInstructionMustBeScalarized(I, VF)) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - - // Figure out whether the access is strided and get the stride value - // if it's known in compile time - const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); - - // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS); - - // Get the overhead of the extractelement and insertelement instructions - // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); - - // If we have a predicated store, it may not be executed for each vector - // lane. Scale the cost by the probability of executing the predicated - // block. - if (Legal->isScalarWithPredication(I)) - Cost /= getReciprocalPredBlockProb(); - - return Cost; - } - - // Determine if the pointer operand of the access is either consecutive or - // reverse consecutive. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. - bool UseGatherOrScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(I); - - unsigned Cost = TTI.getAddressComputationCost(VectorTy); - if (UseGatherOrScatter) { - assert(ConsecutiveStride == 0 && - "Gather/Scatter are not used for consecutive stride"); - return Cost + - TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); - } - // Wide load/stores. - if (Legal->isMaskRequired(I)) - Cost += - TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - - if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - return Cost; + VectorTy = ToVectorTy(getMemInstValueType(I), VF); + return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: case Instruction::SExt: @@ -7115,12 +7433,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - // We optimize the truncation of induction variable. - // The cost of these is the same as the scalar operation. 
- if (I->getOpcode() == Instruction::Trunc && - Legal->isInductionVariable(I->getOperand(0))) - return TTI.getCastInstrCost(I->getOpcode(), I->getType(), - I->getOperand(0)->getType()); + // We optimize the truncation of induction variables having constant + // integer steps. The cost of these truncations is the same as the scalar + // operation. + if (isOptimizableIVTruncate(I, VF)) { + auto *Trunc = cast<TruncInst>(I); + return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), + Trunc->getSrcTy(), Trunc); + } Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); @@ -7143,7 +7463,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); } case Instruction::Call: { bool NeedToScalarize; @@ -7172,9 +7492,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) @@ -7206,81 +7524,34 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - - // Insert values known to be scalar into VecValuesToIgnore. This is a - // conservative estimation of the values that will later be scalarized. - // - // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may - // still be scalarized. For example, we may find an instruction to be - // more profitable for a given vectorization factor if it were to be - // scalarized. But at this point, we haven't yet computed the - // vectorization factor. - for (auto *BB : TheLoop->getBlocks()) - for (auto &I : *BB) - if (Legal->isScalarAfterVectorization(&I)) - VecValuesToIgnore.insert(&I); } -void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr) { - assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - // Holds vector parameters or scalars, in case of uniform vals. - SmallVector<VectorParts, 4> Params; - - setDebugLocFromInst(Builder, Instr); - - // Does this instruction return a value ? - bool IsVoidRetTy = Instr->getType()->isVoidTy(); - - // Initialize a new scalar map entry. - ScalarParts Entry(UF); - - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); - - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(1); - // For each scalar that we create: - - // Start an "if (pred) a[i] = ..." block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - if (Cond[Part]->getType()->isVectorTy()) - Cond[Part] = - Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], - ConstantInt::get(Cond[Part]->getType(), 1)); - } - - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. 
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); - Cloned->setOperand(op, NewOp); - } +LoopVectorizationCostModel::VectorizationFactor +LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Width 1 means no vectorize, cost 0 means uncomputed cost. + const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U, + 0U}; + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. + return NoVectorization; - // Add the cloned scalar to the scalar map entry. - Entry[Part][0] = Cloned; + if (UserVF) { + DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + CM.selectUserVectorizationFactor(UserVF); + return {UserVF, 0}; + } - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + unsigned MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF != 0 && "MaxVF is zero."); + if (MaxVF == 1) + return NoVectorization; - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); - } - VectorLoopValueMap.initScalar(Instr, Entry); + // Select the optimal vectorization factor. + return CM.selectVectorizationFactor(MaxVF); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { @@ -7414,11 +7685,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints); - CM.collectValuesToIgnore(); - // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = @@ -7464,9 +7730,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Select the optimal vectorization factor. - const LoopVectorizationCostModel::VectorizationFactor VF = - CM.selectVectorizationFactor(OptForSize); + // Use the cost model. + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, + &Hints); + CM.collectValuesToIgnore(); + + // Use the planner for vectorization. + LoopVectorizationPlanner LVP(CM); + + // Get user vectorization factor. + unsigned UserVF = Hints.getWidth(); + + // Plan how to best vectorize, return the best VF and its cost. + LoopVectorizationCostModel::VectorizationFactor VF = + LVP.plan(OptForSize, UserVF); // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); @@ -7522,10 +7799,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { // Do not vectorize or interleaving the loop. 
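The new LoopVectorizationPlanner::plan() entry point layers the decisions: ask the cost model for a maximum feasible VF, honor an explicit user VF if one was given, and only then run the cost-based selection. Schematically, with placeholder callbacks in place of the cost-model object (the real code also primes the cost model for a user-supplied VF via selectUserVectorizationFactor()):

    #include <functional>
    #include <optional>
    #include <utility>

    using VFAndCost = std::pair<unsigned, unsigned>;   // {Width, Cost}

    VFAndCost planSketch(bool OptForSize, unsigned UserVF,
                         const std::function<std::optional<unsigned>(bool)> &ComputeMaxVF,
                         const std::function<VFAndCost(unsigned)> &SelectVF) {
      const VFAndCost NoVectorization{1U, 0U};   // width 1 means: do not vectorize
      std::optional<unsigned> MaxVF = ComputeMaxVF(OptForSize);
      if (!MaxVF)
        return NoVectorization;                  // considered too costly to vectorize
      if (UserVF)
        return {UserVF, 0};                      // trust the user's explicit width
      if (*MaxVF == 1)
        return NoVectorization;
      return SelectVF(*MaxVF);                   // cost-based choice up to MaxVF
    }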
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, L->getStartLoc(), L->getHeader()) << VecDiagMsg.second); - ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, L->getStartLoc(), L->getHeader()) << IntDiagMsg.second); return false; @@ -7621,6 +7898,16 @@ bool LoopVectorizePass::runImpl( if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) return false; + bool Changed = false; + + // The vectorizer requires loops to be in simplified form. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop vectorizer + // will simplify all loops, regardless of whether anything end up being + // vectorized. + for (auto &L : *LI) + Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */); + // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. @@ -7632,9 +7919,15 @@ bool LoopVectorizePass::runImpl( LoopsAnalyzed += Worklist.size(); // Now walk the identified inner loops. - bool Changed = false; - while (!Worklist.empty()) - Changed |= processLoop(Worklist.pop_back_val()); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + + // For the inner loops we actually process, form LCSSA to simplify the + // transform. + Changed |= formLCSSARecursively(*L, *DT, LI, SE); + + Changed |= processLoop(L); + } // Process each loop nest in the function. return Changed; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 328f27002960..da3ac06ab464 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -39,6 +39,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> @@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +static cl::opt<bool> + ViewSLPTree("view-slp-tree", cl::Hidden, + cl::desc("Display the SLP trees with Graphviz")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { /// Flag set: NSW, NUW, exact, and all of fast-math. static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { if (auto *VecOp = dyn_cast<Instruction>(I)) { - if (auto *Intersection = dyn_cast<Instruction>(VL[0])) { - // Intersection is initialized to the 0th scalar, - // so start counting from index '1'. + if (auto *I0 = dyn_cast<Instruction>(VL[0])) { + // VecOVp is initialized to the 0th scalar, so start counting from index + // '1'. 
+ VecOp->copyIRFlags(I0); for (int i = 1, e = VL.size(); i < e; ++i) { if (auto *Scalar = dyn_cast<Instruction>(VL[i])) - Intersection->andIRFlags(Scalar); + VecOp->andIRFlags(Scalar); } - VecOp->copyIRFlags(Intersection); } } } @@ -304,6 +309,8 @@ public: typedef SmallVector<Instruction *, 16> InstrList; typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; + typedef MapVector<Value *, SmallVector<Instruction *, 2>> + ExtraValueToDebugLocsMap; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, @@ -330,6 +337,10 @@ public: /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); + /// Vectorize the tree but with the list of externally used values \p + /// ExternallyUsedValues. Values in this MapVector can be replaced but the + /// generated extractvalue instructions. + Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. @@ -343,6 +354,13 @@ public: /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst = None); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking + /// into account (anf updating it, if required) list of externally used + /// values stored in \p ExternallyUsedValues. + void buildTree(ArrayRef<Value *> Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -404,7 +422,7 @@ private: int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); /// \returns True if the ExtractElement/ExtractValue instructions in VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). @@ -451,8 +469,9 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0) {} + TreeEntry(std::vector<TreeEntry> &Container) + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), + Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -468,11 +487,24 @@ private: /// Do we need to gather this sequence ? bool NeedToGather; + + /// Points back to the VectorizableTree. + /// + /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has + /// to be a pointer and needs to be able to initialize the child iterator. + /// Thus we need a reference back to the container to translate the indices + /// to entries. + std::vector<TreeEntry> &Container; + + /// The TreeEntry index containing the user of this entry. We can actually + /// have multiple users so the data structure is not truly a tree. + SmallVector<int, 1> UserTreeIndices; }; /// Create a new VectorizableTree entry. 
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { - VectorizableTree.emplace_back(); + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + int &UserTreeIdx) { + VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -485,6 +517,10 @@ private: } else { MustGather.insert(VL.begin(), VL.end()); } + + if (UserTreeIdx >= 0) + Last->UserTreeIndices.push_back(UserTreeIdx); + UserTreeIdx = idx; return Last; } @@ -558,7 +594,9 @@ private: SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions; /// A list of values that need to extracted out of the tree. - /// This list holds pairs of (Internal Scalar : External User). + /// This list holds pairs of (Internal Scalar : External User). External User + /// can be nullptr, it means that this Internal Scalar will be used later, + /// after vectorization. UserList ExternalUses; /// Values used only by @llvm.assume calls. @@ -706,6 +744,8 @@ private: return os; } #endif + friend struct GraphTraits<BoUpSLP *>; + friend struct DOTGraphTraits<BoUpSLP *>; /// Contains all scheduling data for a basic block. /// @@ -916,17 +956,98 @@ private: /// original width. MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; }; +} // end namespace slpvectorizer + +template <> struct GraphTraits<BoUpSLP *> { + typedef BoUpSLP::TreeEntry TreeEntry; + + /// NodeRef has to be a pointer per the GraphWriter. + typedef TreeEntry *NodeRef; + + /// \brief Add the VectorizableTree to the index iterator to be able to return + /// TreeEntry pointers. + struct ChildIteratorType + : public iterator_adaptor_base<ChildIteratorType, + SmallVector<int, 1>::iterator> { + + std::vector<TreeEntry> &VectorizableTree; + + ChildIteratorType(SmallVector<int, 1>::iterator W, + std::vector<TreeEntry> &VT) + : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} + + NodeRef operator*() { return &VectorizableTree[*I]; } + }; + + static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } + + static ChildIteratorType child_begin(NodeRef N) { + return {N->UserTreeIndices.begin(), N->Container}; + } + static ChildIteratorType child_end(NodeRef N) { + return {N->UserTreeIndices.end(), N->Container}; + } + + /// For the node iterator we just need to turn the TreeEntry iterator into a + /// TreeEntry* iterator so that it dereferences to NodeRef. 
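// A minimal standalone sketch of the pattern ChildIteratorType above relies
// on: nodes live in a vector, edges are stored as indices into that vector,
// and the child iterator carries the container so it can translate an index
// into a node pointer. Plain C++ only; the names are illustrative, not the
// GraphTraits API itself.
#include <vector>

struct Node {
  std::vector<int> ChildIndices;           // like TreeEntry::UserTreeIndices
  std::vector<Node> *Container = nullptr;  // like TreeEntry::Container
};

struct ChildIter {
  std::vector<int>::const_iterator It;
  const std::vector<Node> *Container;
  const Node *operator*() const { return &(*Container)[*It]; }
  ChildIter &operator++() { ++It; return *this; }
  bool operator!=(const ChildIter &O) const { return It != O.It; }
};

ChildIter childBegin(const Node &N) {
  return {N.ChildIndices.begin(), N.Container};
}
ChildIter childEnd(const Node &N) {
  return {N.ChildIndices.end(), N.Container};
}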
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator; + + static nodes_iterator nodes_begin(BoUpSLP *R) { + return nodes_iterator(R->VectorizableTree.begin()); + } + static nodes_iterator nodes_end(BoUpSLP *R) { + return nodes_iterator(R->VectorizableTree.end()); + } + + static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } +}; + +template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { + typedef BoUpSLP::TreeEntry TreeEntry; + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { + std::string Str; + raw_string_ostream OS(Str); + if (isSplat(Entry->Scalars)) { + OS << "<splat> " << *Entry->Scalars[0]; + return Str; + } + for (auto V : Entry->Scalars) { + OS << *V; + if (std::any_of( + R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + OS << " <extract>"; + OS << "\n"; + } + return Str; + } + + static std::string getNodeAttributes(const TreeEntry *Entry, + const BoUpSLP *) { + if (Entry->NeedToGather) + return "color=red"; + return ""; + } +}; } // end namespace llvm -} // end namespace slpvectorizer void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { + ExtraValueToDebugLocsMap ExternallyUsedValues; + buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +} +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0); + buildTree_rec(Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -940,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, if (Entry->NeedToGather) continue; + // Check if the scalar is externally used as an extra arg. + auto ExtI = ExternallyUsedValues.find(Scalar); + if (ExtI != ExternallyUsedValues.end()) { + DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << + Lane << " from " << *Scalar << ".\n"); + ExternalUses.emplace_back(Scalar, nullptr, Lane); + continue; + } for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -976,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } } - -void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { +void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, + int UserTreeIdx) { bool isAltShuffle = false; assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } unsigned Opcode = getSameOpcode(VL); @@ -1014,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // If all of the operands are identical or constant we have a simple solution. 
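// The UserTreeIdx parameter threaded through buildTree_rec() below is both an
// input and an output: it names the entry that uses the current bundle, and
// newTreeEntry() overwrites it with the index of the entry it just created so
// the operand bundles built afterwards link back to it (-1 marks the root).
// A minimal sketch of that in/out convention with illustrative types:
#include <vector>

struct SketchEntry {
  std::vector<int> UserIndices; // entries that use this one
  bool NeedToGather = false;
};

int newEntry(std::vector<SketchEntry> &Tree, bool Vectorized, int &UserIdx) {
  Tree.push_back({});
  int Idx = static_cast<int>(Tree.size()) - 1;
  Tree[Idx].NeedToGather = !Vectorized;
  if (UserIdx >= 0)
    Tree[Idx].UserIndices.push_back(UserIdx);
  UserIdx = Idx; // recursive calls made after this point record Idx as user
  return Idx;
}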
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1026,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1039,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } + // Record the reuse of the tree node. FIXME, currently this is only used to + // properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); return; } @@ -1052,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1062,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1076,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
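// The "perfect diamond merge" case above is why UserTreeIndices is a small
// vector rather than a single index: when a bundle is identical to an entry
// that already exists, only one more user edge is recorded instead of
// re-expanding the subtree (per the FIXME, currently used only for drawing
// the graph). A sketch of that reuse check with illustrative, non-LLVM types:
#include <vector>

struct DiamondEntry {
  std::vector<const void *> Scalars;
  std::vector<int> UserIndices;
};

bool reuseExistingEntry(std::vector<DiamondEntry> &Tree,
                        const std::vector<const void *> &Bundle, int UserIdx) {
  for (DiamondEntry &E : Tree) {
    if (E.Scalars == Bundle) {
      E.UserIndices.push_back(UserIdx); // extra user edge: a DAG, not a tree
      return true;
    }
  }
  return false; // caller builds a fresh entry instead
}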
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1085,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1100,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1117,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1132,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1144,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse); + newTreeEntry(VL, Reuse, UserTreeIdx); return; } case Instruction::Load: { @@ -1160,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1171,7 +1303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { LoadInst *L = cast<LoadInst>(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1193,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1208,7 +1340,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1235,12 +1367,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1249,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 
for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1263,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1278,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1301,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1309,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1320,7 +1452,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1330,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (cast<Instruction>(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1343,7 +1475,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1355,12 +1487,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1368,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1377,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + 
newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1400,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1414,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1425,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1438,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1453,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1462,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1, UserTreeIdx); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1484,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1570,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // If we have computed a smaller type for the expression, update VecTy so @@ -1599,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast<Instruction>(VL[i]); - if (E->hasOneUse()) + // If all users are going to be vectorized, instruction can be + // considered as dead. + // The same, if have only one user, it will be vectorized for sure. + if (E->hasOneUse() || + std::all_of(E->user_begin(), E->user_end(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0; + })) // Take credit for instruction that will become dead. DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); @@ -1624,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); + VL0->getType(), SrcTy, VL0); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -1636,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -1720,18 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. 
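// getEntryCost() consistently reports a delta: the cost of one vector
// instruction minus the cost of the VF scalar instructions it replaces. For
// the ExtractElement bundles above it also credits extracts whose scalar
// users will all be vectorized, since those extracts become dead. A scalar
// model of that credit, with illustrative inputs rather than LLVM API:
#include <vector>

int extractBundleCostDelta(
    const std::vector<std::vector<bool>> &LaneUsersVectorized,
    int ExtractElementCost) {
  int DeadCredit = 0;
  for (const std::vector<bool> &Users : LaneUsersVectorized) {
    bool AllVectorized = true;
    for (bool IsVectorized : Users)
      AllVectorized = AllVectorized && IsVectorized;
    if (Users.size() == 1 || AllVectorized)
      DeadCredit += ExtractElementCost; // this extract will become dead
  }
  return -DeadCredit; // negative delta: vectorizing is a pure saving here
}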
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0); + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, alignment, 0); + VecTy, alignment, 0, VL0); return VecStCost - ScalarStCost; } case Instruction::Call: { @@ -1739,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - SmallVector<Type*, 4> ScalarTys, VecTys; - for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { + SmallVector<Type*, 4> ScalarTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) ScalarTys.push_back(CI->getArgOperand(op)->getType()); - VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), - VecTy->getNumElements())); - } FastMathFlags FMF; if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) @@ -1753,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int ScalarCallCost = VecTy->getNumElements() * TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); - int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF); + SmallVector<Value *, 4> Args(CI->arg_operands()); + int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, + VecTy->getNumElements()); DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" @@ -1947,9 +2086,18 @@ int BoUpSLP::getTreeCost() { int SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"); + std::string Str; + { + raw_string_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; + } + DEBUG(dbgs() << Str); + + if (ViewSLPTree) + ViewGraph(this, "SLP" + F->getName(), false, Str); + return Cost; } @@ -2702,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { + ExtraValueToDebugLocsMap ExternallyUsedValues; + return vectorizeTree(ExternallyUsedValues); +} + +Value * +BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { @@ -2744,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() { // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. - if (!is_contained(Scalar->users(), User)) + if (User && !is_contained(Scalar->users(), User)) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); @@ -2756,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() { assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); + // If User == nullptr, the Scalar is used as extra arg. Generate + // ExtractElement instruction and update the record for this scalar in + // ExternallyUsedValues. 
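// In the extra-argument path introduced below, a scalar recorded in
// ExternallyUsedValues gets an extractelement once its vector value exists,
// and the debug locations stored under the scalar are re-keyed to that
// extract so the reduction code can still find them. A minimal sketch of the
// re-keying step over a std::map (container and names are illustrative; the
// pass itself uses a MapVector):
#include <map>
#include <utility>
#include <vector>

template <typename Key, typename Loc>
void rekeyExternalUses(std::map<Key, std::vector<Loc>> &ExternallyUsed,
                       const Key &Scalar, const Key &Extract) {
  auto It = ExternallyUsed.find(Scalar);
  if (It == ExternallyUsed.end())
    return;                                // nothing recorded for this scalar
  std::vector<Loc> Locs = std::move(It->second);
  ExternallyUsed.erase(It);
  ExternallyUsed.emplace(Extract, std::move(Locs));
}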
+ if (!User) { + assert(ExternallyUsedValues.count(Scalar) && + "Scalar with nullptr as an external user must be registered in " + "ExternallyUsedValues map"); + if (auto *VecI = dyn_cast<Instruction>(Vec)) { + Builder.SetInsertPoint(VecI->getParent(), + std::next(VecI->getIterator())); + } else { + Builder.SetInsertPoint(&F->getEntryBlock().front()); + } + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + Ex = extend(ScalarRoot, Ex, Scalar->getType()); + CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); + auto &Locs = ExternallyUsedValues[Scalar]; + ExternallyUsedValues.insert({Ex, Locs}); + ExternallyUsedValues.erase(Scalar); + continue; + } + // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast<Instruction>(Vec)) { @@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // sorted by the original instruction location. This lets the final schedule // be as close as possible to the original instruction order. struct ScheduleDataCompare { - bool operator()(ScheduleData *SD1, ScheduleData *SD2) { + bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { return SD2->SchedulingPriority < SD1->SchedulingPriority; } }; @@ -3645,9 +3821,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); if (!Changed) return PreservedAnalyses::all(); + PreservedAnalyses PA; - PA.preserve<LoopAnalysis>(); - PA.preserve<DominatorTreeAnalysis>(); + PA.preserveSet<CFGAnalyses>(); PA.preserve<AAManager>(); PA.preserve<GlobalsAA>(); return PA; @@ -4026,36 +4202,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; + Value *P = V->getParent(); + + // Vectorize in current basic block only. + auto *Op0 = dyn_cast<Instruction>(V->getOperand(0)); + auto *Op1 = dyn_cast<Instruction>(V->getOperand(1)); + if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) + return false; + // Try to vectorize V. - if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + if (tryToVectorizePair(Op0, Op1, R)) return true; - BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); - BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); + auto *A = dyn_cast<BinaryOperator>(Op0); + auto *B = dyn_cast<BinaryOperator>(Op1); // Try to skip B. if (B && B->hasOneUse()) { - BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); - BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); - if (tryToVectorizePair(A, B0, R)) { + auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); + auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); + if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) return true; - } - if (tryToVectorizePair(A, B1, R)) { + if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) return true; - } } // Try to skip A. 
if (A && A->hasOneUse()) { - BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); - BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); - if (tryToVectorizePair(A0, B, R)) { + auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); + auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); + if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) return true; - } - if (tryToVectorizePair(A1, B, R)) { + if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) return true; - } } - return 0; + return false; } /// \brief Generate a shuffle mask to be used in a reduction tree. @@ -4119,37 +4299,41 @@ namespace { class HorizontalReduction { SmallVector<Value *, 16> ReductionOps; SmallVector<Value *, 32> ReducedVals; + // Use map vector to make stable output. + MapVector<Instruction *, Value *> ExtraArgs; - BinaryOperator *ReductionRoot; - // After successfull horizontal reduction vectorization attempt for PHI node - // vectorizer tries to update root binary op by combining vectorized tree and - // the ReductionPHI node. But during vectorization this ReductionPHI can be - // vectorized itself and replaced by the undef value, while the instruction - // itself is marked for deletion. This 'marked for deletion' PHI node then can - // be used in new binary operation, causing "Use still stuck around after Def - // is destroyed" crash upon PHI node deletion. - WeakVH ReductionPHI; + BinaryOperator *ReductionRoot = nullptr; /// The opcode of the reduction. - unsigned ReductionOpcode; + Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd; /// The opcode of the values we perform a reduction on. - unsigned ReducedValueOpcode; + unsigned ReducedValueOpcode = 0; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. - bool IsPairwiseReduction; + bool IsPairwiseReduction = false; + + /// Checks if the ParentStackElem.first should be marked as a reduction + /// operation with an extra argument or as extra argument itself. + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, + Value *ExtraArg) { + if (ExtraArgs.count(ParentStackElem.first)) { + ExtraArgs[ParentStackElem.first] = nullptr; + // We ran into something like: + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. + // The whole ParentStackElem.first should be considered as an extra value + // in this case. + // Do not perform analysis of remaining operands of ParentStackElem.first + // instruction, this whole instruction is an extra argument. + ParentStackElem.second = ParentStackElem.first->getNumOperands(); + } else { + // We ran into something like: + // ParentStackElem.first += ... + ExtraArg + ... + ExtraArgs[ParentStackElem.first] = ExtraArg; + } + } public: - /// The width of one full horizontal reduction operation. - unsigned ReduxWidth; - - /// Minimal width of available vector registers. It's used to determine - /// ReduxWidth. - unsigned MinVecRegSize; - - HorizontalReduction(unsigned MinVecRegSize) - : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), - IsPairwiseReduction(false), ReduxWidth(0), - MinVecRegSize(MinVecRegSize) {} + HorizontalReduction() = default; /// \brief Try to find a reduction tree. 
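// markExtraArg() above implements a two-state rule: the first non-reduction
// operand seen for a reduction op is remembered as that op's extra argument;
// if a second one shows up, the op itself is demoted to an extra value
// (recorded as nullptr) and its remaining operands are skipped. A standalone
// sketch of that rule with illustrative types:
#include <map>

struct RdxOp;   // stands in for Instruction
struct RdxVal;  // stands in for Value

void markExtra(std::map<RdxOp *, RdxVal *> &ExtraArgs, RdxOp *Parent,
               RdxVal *Extra, unsigned &NextOperand, unsigned NumOperands) {
  auto It = ExtraArgs.find(Parent);
  if (It != ExtraArgs.end()) {
    It->second = nullptr;        // Parent = Extra1 + Extra2: treat Parent
    NextOperand = NumOperands;   // itself as the extra value, stop descending
  } else {
    ExtraArgs[Parent] = Extra;   // Parent += ... + Extra + ...
  }
}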
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -4176,21 +4360,14 @@ public: if (!isValidElementType(Ty)) return false; - const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. - ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; - ReductionPHI = Phi; - - if (ReduxWidth < 4) - return false; // We currently only support adds. - if (ReductionOpcode != Instruction::Add && - ReductionOpcode != Instruction::FAdd) + if ((ReductionOpcode != Instruction::Add && + ReductionOpcode != Instruction::FAdd) || + !B->isAssociative()) return false; // Post order traverse the reduction tree starting at B. We only handle true @@ -4202,30 +4379,26 @@ public: unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; - // Only handle trees in the current basic block. - if (TreeN->getParent() != B->getParent()) - return false; - - // Each tree node needs to have one user except for the ultimate - // reduction. - if (!TreeN->hasOneUse() && TreeN != B) - return false; - // Postorder vist. if (EdgeToVist == 2 || IsReducedValue) { - if (IsReducedValue) { - // Make sure that the opcodes of the operations that we are going to - // reduce match. - if (!ReducedValueOpcode) - ReducedValueOpcode = TreeN->getOpcode(); - else if (ReducedValueOpcode != TreeN->getOpcode()) - return false; + if (IsReducedValue) ReducedVals.push_back(TreeN); - } else { - // We need to be able to reassociate the adds. - if (!TreeN->isAssociative()) - return false; - ReductionOps.push_back(TreeN); + else { + auto I = ExtraArgs.find(TreeN); + if (I != ExtraArgs.end() && !I->second) { + // Check if TreeN is an extra argument of its parent operation. + if (Stack.size() <= 1) { + // TreeN can't be an extra argument as it is a root reduction + // operation. + return false; + } + // Yes, TreeN is an extra argument, do not add it to a list of + // reduction operations. + // Stack[Stack.size() - 2] always points to the parent operation. + markExtraArg(Stack[Stack.size() - 2], TreeN); + ExtraArgs.erase(TreeN); + } else + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -4242,13 +4415,44 @@ public: // reduced value class. if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode || I->getOpcode() == ReductionOpcode)) { - if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode) + // Only handle trees in the current basic block. + if (I->getParent() != B->getParent()) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + + // Each tree node needs to have one user except for the ultimate + // reduction. + if (!I->hasOneUse() && I != B) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + + if (I->getOpcode() == ReductionOpcode) { + // We need to be able to reassociate the reduction operations. + if (!I->isAssociative()) { + // I is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), I); + continue; + } + } else if (ReducedValueOpcode && + ReducedValueOpcode != I->getOpcode()) { + // Make sure that the opcodes of the operations that we are going to + // reduce match. + // I is an extra argument for TreeN (its parent operation). 
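// The matcher above walks the reduction tree iteratively with a stack of
// (instruction, next-operand-index) pairs: when the index reaches 2, both
// operands have been visited, so the node is handled and popped (the
// "Postorder vist" / "Retract" steps). A generic standalone sketch of that
// traversal shape over a toy binary expression node, with illustrative names:
#include <utility>
#include <vector>

struct BinExpr {
  BinExpr *Ops[2] = {nullptr, nullptr};
};

template <typename Visitor>
void postOrderVisit(BinExpr *Root, Visitor Visit) {
  std::vector<std::pair<BinExpr *, unsigned>> Stack;
  Stack.emplace_back(Root, 0u);
  while (!Stack.empty()) {
    BinExpr *N = Stack.back().first;
    unsigned Edge = Stack.back().second++; // which operand to descend next
    if (Edge == 2) {                       // both operands done: retract
      Visit(N);
      Stack.pop_back();
      continue;
    }
    if (BinExpr *Child = N->Ops[Edge])
      Stack.emplace_back(Child, 0u);
  }
}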
+ markExtraArg(Stack.back(), I); + continue; + } else if (!ReducedValueOpcode) ReducedValueOpcode = I->getOpcode(); + Stack.push_back(std::make_pair(I, 0)); continue; } - return false; } + // NextV is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), NextV); } return true; } @@ -4259,10 +4463,15 @@ public: if (ReducedVals.empty()) return false; + // If there is a sufficient number of reduction values, reduce + // to a nearby power-of-2. Can safely generate oversized + // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < ReduxWidth) + if (NumReducedVals < 4) return false; + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; @@ -4270,20 +4479,26 @@ public: Builder.setFastMathFlags(Unsafe); unsigned i = 0; - for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { + BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; + // The same extra argument may be used several time, so log each attempt + // to use it. + for (auto &Pair : ExtraArgs) + ExternallyUsedValues[Pair.second].push_back(Pair.first); + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ReductionOps); + V.buildTree(VL, ExternallyUsedValues, ReductionOps); if (V.shouldReorder()) { SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); - V.buildTree(Reversed, ReductionOps); + V.buildTree(Reversed, ExternallyUsedValues, ReductionOps); } if (V.isTreeTinyAndNotFullyVectorizable()) - continue; + break; V.computeMinimumValueSizes(); // Estimate cost. - int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); + int Cost = + V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth); if (Cost >= -SLPCostThreshold) break; @@ -4292,33 +4507,44 @@ public: // Vectorize a tree. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(); + Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); // Emit a reduction. - Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedSubTree, "bin.rdx"); + VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, + ReducedSubTree, "bin.rdx"); + propagateIRFlags(VectorizedTree, ReductionOps); } else VectorizedTree = ReducedSubTree; + i += ReduxWidth; + ReduxWidth = PowerOf2Floor(NumReducedVals - i); } if (VectorizedTree) { // Finish the reduction. for (; i < NumReducedVals; ++i) { - Builder.SetCurrentDebugLocation( - cast<Instruction>(ReducedVals[i])->getDebugLoc()); - VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, - ReducedVals[i]); + auto *I = cast<Instruction>(ReducedVals[i]); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + VectorizedTree = + Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); + propagateIRFlags(VectorizedTree, ReductionOps); + } + for (auto &Pair : ExternallyUsedValues) { + assert(!Pair.second.empty() && + "At least one DebugLoc must be inserted"); + // Add each externally used value to the final reduction. 
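// tryToReduce() above no longer derives ReduxWidth from the register size: it
// takes the power-of-two floor of the number of reduced values, vectorizes a
// chunk of that width, then recomputes a (possibly smaller) power-of-two width
// for the remainder, stopping once the width would drop to 2 or below and
// leaving the tail plus the extra arguments to the scalar fix-up code, while
// relying on the backend to legalize oversized vectors. A standalone sketch of
// that chunking, where powerOf2Floor() stands in for llvm::PowerOf2Floor():
#include <utility>
#include <vector>

unsigned powerOf2Floor(unsigned N) {
  if (N == 0)
    return 0;
  unsigned P = 1;
  while (P <= N / 2)
    P *= 2;
  return P;
}

// Returns (start index, width) pairs describing the vectorized chunks.
std::vector<std::pair<unsigned, unsigned>> planReductionChunks(unsigned NumVals) {
  std::vector<std::pair<unsigned, unsigned>> Chunks;
  unsigned I = 0;
  unsigned Width = powerOf2Floor(NumVals);
  while (Width > 2 && I + Width <= NumVals) {
    Chunks.emplace_back(I, Width);
    I += Width;
    Width = powerOf2Floor(NumVals - I);
  }
  return Chunks;
}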
+ for (auto *I : Pair.second) { + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, + Pair.first, "bin.extra"); + propagateIRFlags(VectorizedTree, I); + } } // Update users. - if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) { - assert(ReductionRoot && "Need a reduction operation"); - ReductionRoot->setOperand(0, VectorizedTree); - ReductionRoot->setOperand(1, ReductionPHI); - } else - ReductionRoot->replaceAllUsesWith(VectorizedTree); + ReductionRoot->replaceAllUsesWith(VectorizedTree); } return VectorizedTree != nullptr; } @@ -4329,7 +4555,8 @@ public: private: /// \brief Calculate the cost of a reduction. - int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { + int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, + unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -4352,15 +4579,9 @@ private: return VecReduxCost - ScalarReduxCost; } - static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, - Value *R, const Twine &Name = "") { - if (Opcode == Instruction::FAdd) - return Builder.CreateFAdd(L, R, Name); - return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); - } - /// \brief Emit a horizontal reduction of the vectorized value. - Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { + Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, + unsigned ReduxWidth, ArrayRef<Value *> RedOps) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); @@ -4378,15 +4599,16 @@ private: Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); + TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, + "bin.rdx"); } else { Value *UpperHalf = createRdxShuffleMask(ReduxWidth, i, false, false, Builder); Value *Shuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); + TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); } + propagateIRFlags(TmpVec, RedOps); } // The result is in the first element of the vector. 
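// emitReduction() above lowers the horizontal reduction in log2(ReduxWidth)
// rounds: each round shuffles the upper half of the vector onto the lower
// half and combines them, leaving the result in lane 0. A scalar model of the
// non-pairwise ("split in halves") variant, assuming the lane count is a
// power of two and at least one:
#include <cassert>
#include <cstddef>
#include <vector>

int reduceSplitInHalf(std::vector<int> Lanes) {
  assert(!Lanes.empty() && "expected at least one lane");
  for (std::size_t Width = Lanes.size() / 2; Width >= 1; Width /= 2)
    for (std::size_t I = 0; I < Width; ++I)
      Lanes[I] += Lanes[I + Width]; // per-lane view of the shuffle + bin.rdx
  return Lanes[0];                  // the result is in the first element
}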
@@ -4438,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem, static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl<Value *> &BuildVector, SmallVectorImpl<Value *> &BuildVectorOpds) { - if (!IV->hasOneUse()) - return false; - Value *V = IV->getAggregateOperand(); - if (!isa<UndefValue>(V)) { - InsertValueInst *I = dyn_cast<InsertValueInst>(V); - if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds)) + Value *V; + do { + BuildVector.push_back(IV); + BuildVectorOpds.push_back(IV->getInsertedValueOperand()); + V = IV->getAggregateOperand(); + if (isa<UndefValue>(V)) + break; + IV = dyn_cast<InsertValueInst>(V); + if (!IV || !IV->hasOneUse()) return false; - } - BuildVector.push_back(IV); - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); + } while (true); + std::reverse(BuildVector.begin(), BuildVector.end()); + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } @@ -4507,29 +4732,137 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } +namespace { +/// Tracks instructons and its children. +class WeakVHWithLevel final : public CallbackVH { + /// Operand index of the instruction currently beeing analized. + unsigned Level = 0; + /// Is this the instruction that should be vectorized, or are we now + /// processing children (i.e. operands of this instruction) for potential + /// vectorization? + bool IsInitial = true; + +public: + explicit WeakVHWithLevel() = default; + WeakVHWithLevel(Value *V) : CallbackVH(V){}; + /// Restart children analysis each time it is repaced by the new instruction. + void allUsesReplacedWith(Value *New) override { + setValPtr(New); + Level = 0; + IsInitial = true; + } + /// Check if the instruction was not deleted during vectorization. + bool isValid() const { return !getValPtr(); } + /// Is the istruction itself must be vectorized? + bool isInitial() const { return IsInitial; } + /// Try to vectorize children. + void clearInitial() { IsInitial = false; } + /// Are all children processed already? + bool isFinal() const { + assert(getValPtr() && + (isa<Instruction>(getValPtr()) && + cast<Instruction>(getValPtr())->getNumOperands() >= Level)); + return getValPtr() && + cast<Instruction>(getValPtr())->getNumOperands() == Level; + } + /// Get next child operation. + Value *nextOperand() { + assert(getValPtr() && isa<Instruction>(getValPtr()) && + cast<Instruction>(getValPtr())->getNumOperands() > Level); + return cast<Instruction>(getValPtr())->getOperand(Level++); + } + virtual ~WeakVHWithLevel() = default; +}; +} // namespace + /// \brief Attempt to reduce a horizontal reduction. /// If it is legal to match a horizontal reduction feeding -/// the phi node P with reduction operators BI, then check if it -/// can be done. +/// the phi node P with reduction operators Root in a basic block BB, then check +/// if it can be done. /// \returns true if a horizontal reduction was matched and reduced. /// \returns false if a horizontal reduction was not matched. 
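// The findBuildAggregate() rewrite earlier in this hunk replaces recursion
// with a loop: walk from the last insertvalue back through its aggregate
// operands, collecting each step, then reverse both lists so operands come
// out in build order. A standalone sketch of that walk-then-reverse shape
// over a toy singly linked chain (illustrative types; it omits the
// single-use and undef-base checks the real code performs):
#include <algorithm>
#include <vector>

struct InsertStep {
  InsertStep *PrevStep = nullptr; // aggregate operand, if it is another insert
  int InsertedValue = 0;
};

bool collectBuildChain(InsertStep *Last, std::vector<int> &OperandsInOrder) {
  for (InsertStep *S = Last; S; S = S->PrevStep)
    OperandsInOrder.push_back(S->InsertedValue);
  // Collected from last insert back to the first; callers want build order.
  std::reverse(OperandsInOrder.begin(), OperandsInOrder.end());
  return !OperandsInOrder.empty();
}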
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, - BoUpSLP &R, TargetTransformInfo *TTI, - unsigned MinRegSize) { +static bool canBeVectorized( + PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, + TargetTransformInfo *TTI, + const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; - HorizontalReduction HorRdx(MinRegSize); - if (!HorRdx.matchAssociativeReduction(P, BI)) + if (!Root) return false; - // If there is a sufficient number of reduction values, reduce - // to a nearby power-of-2. Can safely generate oversized - // vectors and rely on the backend to split them to legal sizes. - HorRdx.ReduxWidth = - std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + if (Root->getParent() != BB) + return false; + SmallVector<WeakVHWithLevel, 8> Stack(1, Root); + SmallSet<Value *, 8> VisitedInstrs; + bool Res = false; + while (!Stack.empty()) { + Value *V = Stack.back(); + if (!V) { + Stack.pop_back(); + continue; + } + auto *Inst = dyn_cast<Instruction>(V); + if (!Inst || isa<PHINode>(Inst)) { + Stack.pop_back(); + continue; + } + if (Stack.back().isInitial()) { + Stack.back().clearInitial(); + if (auto *BI = dyn_cast<BinaryOperator>(Inst)) { + HorizontalReduction HorRdx; + if (HorRdx.matchAssociativeReduction(P, BI)) { + if (HorRdx.tryToReduce(R, TTI)) { + Res = true; + P = nullptr; + continue; + } + } + if (P) { + Inst = dyn_cast<Instruction>(BI->getOperand(0)); + if (Inst == P) + Inst = dyn_cast<Instruction>(BI->getOperand(1)); + if (!Inst) { + P = nullptr; + continue; + } + } + } + P = nullptr; + if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) { + Res = true; + continue; + } + } + if (Stack.back().isFinal()) { + Stack.pop_back(); + continue; + } - return HorRdx.tryToReduce(R, TTI); + if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand())) + if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second && + Stack.size() < RecursionMaxDepth) + Stack.push_back(NextV); + } + return Res; +} + +bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, + BasicBlock *BB, BoUpSLP &R, + TargetTransformInfo *TTI) { + if (!V) + return false; + auto *I = dyn_cast<Instruction>(V); + if (!I) + return false; + + if (!isa<BinaryOperator>(I)) + P = nullptr; + // Try to match and vectorize a horizontal reduction. + return canBeVectorized(P, I, BB, R, TTI, + [this](BinaryOperator *BI, BoUpSLP &R) -> bool { + return tryToVectorize(BI, R); + }); } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { @@ -4599,67 +4932,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = getReductionValue(DT, P, BB, LI); - - // Check if this is a Binary Operator. - BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); - if (!BI) - continue; - // Try to match and vectorize a horizontal reduction. - if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) { + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { Changed = true; it = BB->begin(); e = BB->end(); continue; } - - Value *Inst = BI->getOperand(0); - if (Inst == P) - Inst = BI->getOperand(1); - - if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) { - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. 
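// canBeVectorized() above replaces the old single-level check with an
// explicit depth-first walk: each stack element remembers which operand to
// look at next (the Level cursor in WeakVHWithLevel), a node itself is tried
// once before its operands, and the walk is bounded by a visited set and a
// depth cap. A standalone sketch of that traversal shape; MaxDepth and the
// callback are illustrative stand-ins, not the pass's actual parameters:
#include <cstddef>
#include <unordered_set>
#include <vector>

struct OpNode {
  std::vector<OpNode *> Operands;
};

template <typename TryVectorize>
bool walkVectorizationRoots(OpNode *Root, std::size_t MaxDepth,
                            TryVectorize Try) {
  if (!Root)
    return false;
  struct Frame {
    OpNode *N;
    std::size_t NextOperand = 0;
    bool Initial = true; // node itself not tried yet
  };
  std::vector<Frame> Stack;
  Stack.push_back({Root});
  std::unordered_set<OpNode *> Visited{Root};
  bool Changed = false;
  while (!Stack.empty()) {
    Frame &F = Stack.back();
    if (F.Initial) {
      F.Initial = false;
      Changed |= Try(F.N);                    // try this node first
    }
    if (F.NextOperand == F.N->Operands.size()) {
      Stack.pop_back();                       // all operands processed
      continue;
    }
    OpNode *Child = F.N->Operands[F.NextOperand++];
    if (Child && Visited.insert(Child).second && Stack.size() < MaxDepth)
      Stack.push_back({Child});               // descend into a new operand
  }
  return Changed;
}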
- Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } - continue; } - if (ShouldStartVectorizeHorAtStore) - if (StoreInst *SI = dyn_cast<StoreInst>(it)) - if (BinaryOperator *BinOp = - dyn_cast<BinaryOperator>(SI->getValueOperand())) { - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - R.getMinVecRegSize()) || - tryToVectorize(BinOp, R)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } + if (ShouldStartVectorizeHorAtStore) { + if (StoreInst *SI = dyn_cast<StoreInst>(it)) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; } + } + } // Try to vectorize horizontal reductions feeding into a return. - if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) - if (RI->getNumOperands() != 0) - if (BinaryOperator *BinOp = - dyn_cast<BinaryOperator>(RI->getOperand(0))) { - DEBUG(dbgs() << "SLP: Found a return to vectorize.\n"); - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - R.getMinVecRegSize()) || - tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1), - R)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; - } + if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) { + if (RI->getNumOperands() != 0) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; } + } + } // Try to vectorize trees that start at compare instructions. if (CmpInst *CI = dyn_cast<CmpInst>(it)) { @@ -4672,16 +4980,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - for (int i = 0; i < 2; ++i) { - if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) { - if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { - Changed = true; - // We would like to start over since some instructions are deleted - // and the iterator may become invalid value. - it = BB->begin(); - e = BB->end(); - break; - } + for (int I = 0; I < 2; ++I) { + if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { + Changed = true; + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + it = BB->begin(); + e = BB->end(); + break; } } continue; |