| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:36:56 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:44:01 +0000 |
| commit | 0eae32dcef82f6f06de6419a0d623d7def0cc8f6 (patch) | |
| tree | 55b7e05be47b835fd137915bee1e64026c35e71c /contrib/llvm-project/llvm/lib/Transforms/Vectorize | |
| parent | 4824e7fd18a1223177218d4aec1b3c6c5c4a444e (diff) | |
| parent | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
12 files changed, 912 insertions, 551 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 805011191da0..81e5aa223c07 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); -// FIXME: When scalable vectorization is stable enough, change the default -// to SK_PreferFixedWidth. -static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( - "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), - cl::Hidden, - cl::desc("Control whether the compiler can use scalable vectors to " - "vectorize a loop"), - cl::values( - clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", - "Scalable vectorization is disabled."), - clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", - "Scalable vectorization is available, but favor fixed-width " - "vectorization when the cost is inconclusive."), - clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", - "Scalable vectorization is available and favored when the " - "cost is inconclusive."))); +static cl::opt<LoopVectorizeHints::ScalableForceKind> + ForceScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "on", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, - OptimizationRemarkEmitter &ORE) + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo *TTI) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), @@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + // If the metadata doesn't explicitly specify whether to enable scalable + // vectorization, then decide based on the following criteria (increasing + // level of priority): + // - Target default + // - Metadata width + // - Force option (always overrides) + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) { + if (TTI) + Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable + : SK_FixedWidthOnly; + + if (Width.Value) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. 
+ Scalable.Value = SK_FixedWidthOnly; + } + + // If the flag is set to force any use of scalable vectors, override the loop + // hints. + if (ForceScalableVectorization.getValue() != + LoopVectorizeHints::SK_Unspecified) + Scalable.Value = ForceScalableVectorization.getValue(); + + // Scalable vectorization is disabled if no preference is specified. if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) - // If the width is set, but the metadata says nothing about the scalable - // property, then assume it concerns only a fixed-width UserVF. - // If width is not set, the flag takes precedence. - Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization; - else if (ScalableVectorization == SK_FixedWidthOnly) - // If the flag is set to disable any use of scalable vectors, override the - // loop hint. Scalable.Value = SK_FixedWidthOnly; if (IsVectorized.Value != 1) @@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) { return Inductions.count(PN); } -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +const InductionDescriptor * +LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_IntInduction || + ID.getKind() == InductionDescriptor::IK_FpInduction) + return &ID; + return nullptr; +} + +bool LoopVectorizationLegality::isCastedInductionVariable( + const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); return (Inst && InductionCastsToIgnore.count(Inst)); } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionVariable(const Value *V) const { return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { +bool LoopVectorizationLegality::isFirstOrderRecurrence( + const PHINode *Phi) const { return FirstOrderRecurrences.count(Phi); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a7d6609f8c56..71eb39a18d2f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,16 +45,17 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); + ArrayRef<VPValue *> Operands, DebugLoc DL) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); if (BB) BB->insert(Instr, InsertPt); return Instr; } VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands)); + std::initializer_list<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); } public: @@ -123,30 +124,33 @@ public: /// its underlying Instruction. 
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction *Inst = nullptr) { - VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + DebugLoc DL; + if (Inst) + DL = Inst->getDebugLoc(); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } - VPValue *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, - Instruction *Inst = nullptr) { - return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, Operands, DL); } - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); + VPValue *createNot(VPValue *Operand, DebugLoc DL) { + return createInstruction(VPInstruction::Not, {Operand}, DL); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); } - VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); } //===--------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ca0adb4242c..4747f34fcc62 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,8 @@ class GeneratedRTChecks; namespace llvm { +AnalysisKey ShouldRunExtraVectorPasses::Key; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -506,8 +508,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, VPValue *Def, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -534,7 +536,7 @@ public: /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. - bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -619,7 +621,7 @@ protected: /// can also be a truncate instruction. 
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +631,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +640,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. - /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -698,7 +676,8 @@ protected: /// flags, which can be found from the original scalar operations. Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const; + const InductionDescriptor &ID, + BasicBlock *VectorHeader) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -1728,7 +1707,8 @@ private: /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount UserVF, + bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -1741,7 +1721,8 @@ private: ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF); + const ElementCount &MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. 
@@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); @@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else - RuntimeVF = getRuntimeVF(Builder, StepType, VF); + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. @@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa<TruncInst>(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. 
- // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. - if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, + const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - - auto II = Legal->getInductionVars().find(IV); - assert(II != Legal->getInductionVars().end() && "IV is not an induction"); - - auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto &DL = EntryVal->getModule()->getDataLayout(); // Generate code for the induction step. Note that induction steps are // required to be loop-invariant @@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, if (PSE.getSE()->isSCEVable(IV->getType())) { SCEVExpander Exp(*PSE.getSE(), DL, "induction"); return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); + State.CFG.VectorPreHeader->getTerminator()); } return cast<SCEVUnknown>(Step)->getValue(); }; @@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, + State.CFG.PrevBB); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) - StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + StartIdx = + getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); else - StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (State.VF.isZero() || State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF.isVector() && "VF should be greater than one"); + assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. bool IsUniform = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); - unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. 
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!IsUniform && VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, VF); - UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); - SplatStep = Builder.CreateVectorSplat(VF, Step); - SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + if (!IsUniform && State.VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - if (!IsUniform && VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + if (!IsUniform && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((VF.isScalable() || isa<Constant>(StartIdx)) && + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized // instruction could feed a poison value to the base address of the widen // load/store. - if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. 
- for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) + VPValue *Operand = I.value(); + if (State.Plan->isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); - Cloned->setOperand(op, NewOp); + Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } addNewMetadata(Cloned, Instr); @@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, Value *InnerLoopVectorizer::emitTransformedIndex( IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + const InductionDescriptor &ID, BasicBlock *VectorHeader) const { SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); @@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex( }; // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because + // loop, choose the end of the vector loop header (=VectorHeader), because // the DomTree is not kept up-to-date for additional blocks generated in the // vector loop. By using the header as insertion point, we guarantee that the // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B]() { + auto GetInsertPoint = [this, &B, VectorHeader]() { BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); + LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) + return VectorHeader->getTerminator(); return &*B.GetInsertPoint(); }; @@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues( CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). 
@@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues( CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE, TTI); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Value *Escape = + emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { } } -bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { +bool InnerLoopVectorizer::useOrderedReductions( + const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } @@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), + DL, II, State.CFG.PrevBB); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (VScaleMax > 0) - MaxVScale = VScaleMax; - } + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) + MaxVScale = + TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; } -FixedScalableVFPair -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( + unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we chose. // FIXME: The condition below pessimises the case for fixed-width vectors, @@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF) { + const ElementCount &MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( const auto TripCountEC = ElementCount::getFixed(ConstTripCount); if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { - // We need to clamp the VF to be the ConstTripCount. 
There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. - LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return TripCountEC; + (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { + // If loop trip count (TC) is known at compile time there is no point in + // choosing VF greater than TC (as done in the loop below). Select maximum + // power of two which doesn't exceed TC. + // If MaxVectorElementCount is scalable, we only fall back on a fixed VF + // when the TC is less than or equal to the known number of lanes. + auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedConstTripCount << "\n"); + return ElementCount::getFixed(ClampedConstTripCount); } ElementCount MaxVF = MaxVectorElementCount; @@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( EstimatedWidthB *= VScale.getValue(); } - // When set to preferred, for now assume vscale may be larger than 1 (or the - // one being tuned for), so that scalable vectorization is slightly favorable - // over fixed-width vectorization. - if (Hints->isScalableVectorizationPreferred()) - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), @@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); @@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // Matched reduce(mul(ext, ext)) - InstructionCost ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); + Type *Op0Ty = 
Op0->getOperand(0)->getType(); + Type *Op1Ty = Op1->getOperand(0)->getType(); + Type *LargestOpTy = + Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty + : Op0Ty; + auto *ExtType = VectorType::get(LargestOpTy, VectorTy); + + // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // different sizes. We take the largest type as the ext to reduce, and add + // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). + InstructionCost ExtCost0 = TTI.getCastInstrCost( + Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost ExtCost1 = TTI.getCastInstrCost( + Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + InstructionCost ExtraExtCost = 0; + if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { + Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; + ExtraExtCost = TTI.getCastInstrCost( + ExtraExtOp->getOpcode(), ExtType, + VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), + TTI::CastContextHint::None, CostKind, ExtraExtOp); + } - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + if (RedCost.isValid() && + (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) @@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) + Pred = Cmp->getPredicate(); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + cast<CmpInst>(I)->getPredicate(), CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; + const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
for (auto &Induction : Legal->getInductionVars()) { - InductionDescriptor &IndDes = Induction.second; + const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } @@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { void LoopVectorizationCostModel::collectInLoopReductions() { for (auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; + const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) @@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. - InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask); + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to @@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // EdgeMask is poison. Using 'and' here introduces undefined behavior. VPValue *False = Plan->getOrAddVPValue( ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + EdgeMask = + Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; @@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV. + // Start by constructing the desired canonical IV in the header block. 
VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); IV = IVRecipe; } + + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { continue; } - BlockMask = Builder.createOr(BlockMask, EdgeMask); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; @@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - assert(II.getStartValue() == + if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { + assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + auto *Phi = cast<PHINode>(I->getOperand(0)); + const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, nullptr, I); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); } return nullptr; } @@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. 
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, @@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + RecurKind Kind = + Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); @@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique<VPlan>(); + // Create initial VPlan skeleton, with separate header and latch blocks. + VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique<VPlan>(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. 
+ VPBB = nullptr; + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, + [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, + DeadInstructions, *PSE.getSE()); return Plan; } @@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) @@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. 
- if (State.MayGeneratePoisonRecipes.count(this) > 0) + if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } @@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), - getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - if (!Result.MadeCFGChange) + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. + AM.getResult<ShouldRunExtraVectorPasses>(F); + PA.preserve<ShouldRunExtraVectorPasses>(); + } else { PA.preserveSet<CFGAnalyses>(); + } return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 95061e9053fa..37ae13666f7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -812,6 +811,13 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. 
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized + /// - just return ReorderIndices, otherwise check if the scalars can be + /// reordered and return the most optimal order. + /// \param TopToBottom If true, include the order of vectorized stores and + /// insertelement nodes, otherwise skip them. + Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -1010,18 +1016,25 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } - // The hard-coded scores listed here are not very important. When computing - // the scores of matching one sub-tree with another, we are basically - // counting the number of values that are matching. So even if all scores - // are set to 1, we would still get a decent matching result. + // The hard-coded scores listed here are not very important, though it shall + // be higher for better matches to improve the resulting cost. When + // computing the scores of matching one sub-tree with another, we are + // basically counting the number of values that are matching. So even if all + // scores are set to 1, we would still get a decent matching result. // However, sometimes we have to break ties. For example we may have to // choose between matching loads vs matching opcodes. This is what these - // scores are helping us with: they provide the order of preference. + // scores are helping us with: they provide the order of preference. Also, + // this is important if the scalar is externally used or used in another + // tree entry node in the different lane. /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). - static const int ScoreConsecutiveLoads = 3; + static const int ScoreConsecutiveLoads = 4; + /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). + static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. - static const int ScoreConsecutiveExtracts = 3; + static const int ScoreConsecutiveExtracts = 4; + /// ExtractElementInst from same vector and reversed indices. + static const int ScoreReversedExtracts = 3; /// Constants. static const int ScoreConstants = 2; /// Instructions with the same opcode. @@ -1041,7 +1054,10 @@ public: /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE) { + ScalarEvolution &SE, int NumLanes) { + if (V1 == V2) + return VLOperands::ScoreSplat; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { @@ -1051,8 +1067,17 @@ public: Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (!Dist) + return VLOperands::ScoreFail; + // The distance is too large - still may be profitable to use masked + // loads/gathers. + if (std::abs(*Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + // This still will detect consecutive loads, but we might have "holes" + // in some cases. 
It is ok for non-power-2 vectorization and may produce + // better results. It should not affect current vectorization. + return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); @@ -1062,18 +1087,41 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. - Value *EV; - ConstantInt *Ex1Idx, *Ex2Idx; - if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && - match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && - Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) - return VLOperands::ScoreConsecutiveExtracts; + Value *EV1; + ConstantInt *Ex1Idx; + if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { + // Undefs are always profitable for extractelements. + if (isa<UndefValue>(V2)) + return VLOperands::ScoreConsecutiveExtracts; + Value *EV2 = nullptr; + ConstantInt *Ex2Idx = nullptr; + if (match(V2, + m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), + m_Undef())))) { + // Undefs are always profitable for extractelements. + if (!Ex2Idx) + return VLOperands::ScoreConsecutiveExtracts; + if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + return VLOperands::ScoreConsecutiveExtracts; + if (EV2 == EV1) { + int Idx1 = Ex1Idx->getZExtValue(); + int Idx2 = Ex2Idx->getZExtValue(); + int Dist = Idx2 - Idx1; + // The distance is too large - still may be profitable to use + // shuffles. + if (std::abs(Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts + : VLOperands::ScoreReversedExtracts; + } + } + } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { - if (I1 == I2) - return VLOperands::ScoreSplat; + if (I1->getParent() != I2->getParent()) + return VLOperands::ScoreFail; InstructionsState S = getSameOpcode({I1, I2}); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. @@ -1088,11 +1136,13 @@ public: return VLOperands::ScoreFail; } - /// Holds the values and their lane that are taking part in the look-ahead + /// Holds the values and their lanes that are taking part in the look-ahead /// score calculation. This is used in the external uses cost calculation. - SmallDenseMap<Value *, int> InLookAheadValues; + /// Need to hold all the lanes in case of splat/broadcast at least to + /// correctly check for the use in the different lane. + SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// \returns the additional cost due to uses of \p LHS and \p RHS that are /// either external to the vectorized code, or require shuffling. int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { @@ -1116,22 +1166,30 @@ public: for (User *U : V->users()) { if (const TreeEntry *UserTE = R.getTreeEntry(U)) { // The user is in the VectorizableTree. Check if we need to insert. - auto It = llvm::find(UserTE->Scalars, U); - assert(It != UserTE->Scalars.end() && "U is in UserTE"); - int UserLn = std::distance(UserTE->Scalars.begin(), It); + int UserLn = UserTE->findLaneForValue(U); assert(UserLn >= 0 && "Bad lane"); - if (UserLn != Ln) + // If the values are different, check just the line of the current + // value. If the values are the same, need to add UserInDiffLaneCost + // only if UserLn does not match both line numbers. 
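Where the old code only accepted a pointer or index distance of exactly 1, the scoring above now tolerates gaps and reversed order for both loads and extractelements. A minimal standalone sketch of that decision follows; the helper and its name are hypothetical, the 4/3 constants mirror ScoreConsecutive*/ScoreReversed* from the patch, and the ScoreAltOpcodes value is assumed to be 1 since it is not shown in this hunk.

#include <cstdlib>

static int scoreByDistance(int Dist, int NumLanes) {
  const int ScoreConsecutive = 4; // ScoreConsecutiveLoads / ScoreConsecutiveExtracts
  const int ScoreReversed = 3;    // ScoreReversedLoads / ScoreReversedExtracts
  const int ScoreAltOpcodes = 1;  // assumed value of the existing constant
  // Too far apart: may still be worth a masked load/gather or a shuffle, so
  // fall back to the weaker score rather than failing outright.
  if (std::abs(Dist) > NumLanes / 2)
    return ScoreAltOpcodes;
  // Small forward distance is preferred; small backward distance still scores.
  return Dist > 0 ? ScoreConsecutive : ScoreReversed;
}
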
+ if ((LHS.first != RHS.first && UserLn != Ln) || + (LHS.first == RHS.first && UserLn != LHS.second && + UserLn != RHS.second)) { Cost += UserInDiffLaneCost; + break; + } } else { // Check if the user is in the look-ahead code. auto It2 = InLookAheadValues.find(U); if (It2 != InLookAheadValues.end()) { // The user is in the look-ahead code. Check the lane. - if (It2->second != Ln) + if (!It2->getSecond().contains(Ln)) { Cost += UserInDiffLaneCost; + break; + } } else { // The user is neither in SLP tree nor in the look-ahead code. Cost += ExternalUseCost; + break; } } // Limit the number of visited uses to cap compilation time. @@ -1170,32 +1228,36 @@ public: Value *V1 = LHS.first; Value *V2 = RHS.first; // Get the shallow score of V1 and V2. - int ShallowScoreAtThisLevel = - std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - - getExternalUsesCost(LHS, RHS)); + int ShallowScoreAtThisLevel = std::max( + (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - + getExternalUsesCost(LHS, RHS)); int Lane1 = LHS.second; int Lane2 = RHS.second; // If reached MaxLevel, // or if V1 and V2 are not instructions, // or if they are SPLAT, - // or if they are not consecutive, early return the current cost. + // or if they are not consecutive, + // or if profitable to vectorize loads or extractelements, early return + // the current cost. auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || ShallowScoreAtThisLevel == VLOperands::ScoreFail || - (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && + ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1] = Lane1; - InLookAheadValues[V2] = Lane2; + InLookAheadValues[V1].insert(Lane1); + InLookAheadValues[V2].insert(Lane2); // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; - // Recursion towards the operands of I1 and I2. We are trying all possbile + // Recursion towards the operands of I1 and I2. We are trying all possible // operand pairs, and keeping track of the best score. for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); OpIdx1 != NumOperands1; ++OpIdx1) { @@ -1319,27 +1381,79 @@ public: return None; } - /// Helper for reorderOperandVecs. \Returns the lane that we should start - /// reordering from. This is the one which has the least number of operands - /// that can freely move about. + /// Helper for reorderOperandVecs. + /// \returns the lane that we should start reordering from. This is the one + /// which has the least number of operands that can freely move about or + /// less profitable because it already has the most optimal set of operands. 
unsigned getBestLaneToStartReordering() const { - unsigned BestLane = 0; unsigned Min = UINT_MAX; - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) { - unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); - if (NumFreeOps < Min) { - Min = NumFreeOps; - BestLane = Lane; + unsigned SameOpNumber = 0; + // std::pair<unsigned, unsigned> is used to implement a simple voting + // algorithm and choose the lane with the least number of operands that + // can freely move about or less profitable because it already has the + // most optimal set of operands. The first unsigned is a counter for + // voting, the second unsigned is the counter of lanes with instructions + // with same/alternate opcodes and same parent basic block. + MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; + // Try to be closer to the original results, if we have multiple lanes + // with same cost. If 2 lanes have the same cost, use the one with the + // lowest index. + for (int I = getNumLanes(); I > 0; --I) { + unsigned Lane = I - 1; + OperandsOrderData NumFreeOpsHash = + getMaxNumOperandsThatCanBeReordered(Lane); + // Compare the number of operands that can move and choose the one with + // the least number. + if (NumFreeOpsHash.NumOfAPOs < Min) { + Min = NumFreeOpsHash.NumOfAPOs; + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap.clear(); + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { + // Select the most optimal lane in terms of number of operands that + // should be moved around. + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { + ++HashMap[NumFreeOpsHash.Hash].first; + } + } + // Select the lane with the minimum counter. + unsigned BestLane = 0; + unsigned CntMin = UINT_MAX; + for (const auto &Data : reverse(HashMap)) { + if (Data.second.first < CntMin) { + CntMin = Data.second.first; + BestLane = Data.second.second; } } return BestLane; } - /// \Returns the maximum number of operands that are allowed to be reordered - /// for \p Lane. This is used as a heuristic for selecting the first lane to - /// start operand reordering. - unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + /// Data structure that helps to reorder operands. + struct OperandsOrderData { + /// The best number of operands with the same APOs, which can be + /// reordered. + unsigned NumOfAPOs = UINT_MAX; + /// Number of operands with the same/alternate instruction opcode and + /// parent. + unsigned NumOpsWithSameOpcodeParent = 0; + /// Hash for the actual operands ordering. + /// Used to count operands, actually their position id and opcode + /// value. It is used in the voting mechanism to find the lane with the + /// least number of operands that can freely move about or less profitable + /// because it already has the most optimal set of operands. Can be + /// replaced with SmallVector<unsigned> instead but hash code is faster + /// and requires less memory. + unsigned Hash = 0; + }; + /// \returns the maximum number of operands that are allowed to be reordered + /// for \p Lane and the number of compatible instructions(with the same + /// parent/opcode). This is used as a heuristic for selecting the first lane + /// to start operand reordering. 
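The body that follows counts, per lane, how many operands share an opcode and parent block by running a Boyer-Moore majority vote over them. Below is a minimal generic illustration of that voting scheme, kept separate from the patch; the template and its name are invented for the example, and the returned counter is only an approximation of the majority's size, which is all the heuristic needs.

#include <vector>

// Keep one candidate and a counter: increment on a match, decrement on a
// mismatch, and adopt a new candidate whenever the counter reaches zero.
template <typename T>
static unsigned majorityVoteCount(const std::vector<T> &Vals) {
  unsigned Count = 0;
  const T *Candidate = nullptr;
  for (const T &V : Vals) {
    if (Count == 0) {
      Candidate = &V;
      Count = 1;
    } else if (V == *Candidate) {
      ++Count;
    } else {
      --Count;
    }
  }
  return Count; // 0 when no value clearly dominates
}
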
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { unsigned CntTrue = 0; unsigned NumOperands = getNumOperands(); // Operands with the same APO can be reordered. We therefore need to count @@ -1348,11 +1462,45 @@ public: // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + // Operands with the same instruction opcode and parent are more + // profitable since we don't need to move them in many cases, with a high + // probability such lane already can be vectorized effectively. + bool AllUndefs = true; + unsigned NumOpsWithSameOpcodeParent = 0; + Instruction *OpcodeI = nullptr; + BasicBlock *Parent = nullptr; + unsigned Hash = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; - return std::max(CntTrue, CntFalse); + // Use Boyer-Moore majority voting for finding the majority opcode and + // the number of times it occurs. + if (auto *I = dyn_cast<Instruction>(OpData.V)) { + if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + I->getParent() != Parent) { + if (NumOpsWithSameOpcodeParent == 0) { + NumOpsWithSameOpcodeParent = 1; + OpcodeI = I; + Parent = I->getParent(); + } else { + --NumOpsWithSameOpcodeParent; + } + } else { + ++NumOpsWithSameOpcodeParent; + } + } + Hash = hash_combine( + Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); + AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); + } + if (AllUndefs) + return {}; + OperandsOrderData Data; + Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); + Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; + Data.Hash = Hash; + return Data; } /// Go through the instructions in VL and append their operands. @@ -1500,11 +1648,37 @@ public: ReorderingModes[OpIdx] = ReorderingMode::Failed; } + // Check that we don't have same operands. No need to reorder if operands + // are just perfect diamond or shuffled diamond match. Do not do it only + // for possible broadcasts or non-power of 2 number of scalars (just for + // now). + auto &&SkipReordering = [this]() { + SmallPtrSet<Value *, 4> UniqueValues; + ArrayRef<OperandData> Op0 = OpsVec.front(); + for (const OperandData &Data : Op0) + UniqueValues.insert(Data.V); + for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + if (any_of(Op, [&UniqueValues](const OperandData &Data) { + return !UniqueValues.contains(Data.V); + })) + return false; + } + // TODO: Check if we can remove a check for non-power-2 number of + // scalars after full support of non-power-2 vectorization. + return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); + }; + // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. for (int Pass = 0; Pass != 2; ++Pass) { + // Check if no need to reorder operands since they're are perfect or + // shuffled diamond match. + // Need to to do it to avoid extra external use cost counting for + // shuffled matches, which may cause regressions. + if (SkipReordering()) + break; // Skip the second pass if the first pass did not fail. 
bool StrategyFailed = false; // Mark all operand data as free to use. @@ -1792,9 +1966,10 @@ private: if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = OpVL[Lane]; + assert(OpVL.size() <= Scalars.size() && + "Number of operands is greater than the number of scalars."); + Operands[OpIdx].resize(OpVL.size()); + copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. @@ -1944,7 +2119,7 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) + for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; @@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, + bool TopToBottom) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE.ReuseShuffleIndices.empty()) + return None; + if (TE.State == TreeEntry::Vectorize && + (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || + (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && + !TE.isAltShuffle()) + return TE.ReorderIndices; + if (TE.State == TreeEntry::NeedToGather) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. + if (((TE.getOpcode() == Instruction::ExtractElement && + !TE.isAltShuffle()) || + (all_of(TE.Scalars, + [](Value *V) { + return isa<UndefValue, ExtractElementInst>(V); + }) && + any_of(TE.Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }))) && + all_of(TE.Scalars, + [](Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); + }) && + allSameType(TE.Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); + return CurrentOrder; + } + } + if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; + } + return None; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; @@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. + // Currently the are vectorized stores,loads,extracts + some gathering of + // extracts. for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( const std::unique_ptr<TreeEntry> &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, - InsertElementInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/true)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. - if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of(VectorizableTree, - [&TE](const std::unique_ptr<TreeEntry> &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && - Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - OrderedEntries.insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that every instruction appears once in this bundle. DenseMap<Value *, unsigned> UniquePositions; for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back( + isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? 
-1 - : Res.first->second); + ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } @@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || + (UniquePositions.size() == 1 && all_of(UniqueValues, + [](Value *V) { + return isa<UndefValue>(V) || + !isConstant(V); + })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - // If any of the scalars is marked as a value that needs to stay scalar, then - // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, SmallVectorImpl<unsigned> &CurrentOrder) const { - Instruction *E0 = cast<Instruction>(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa<ExtractElementInst, ExtractValueInst>(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast<Instruction>(*It); + assert(all_of(VL, + [](Value *V) { + return isa<UndefValue, ExtractElementInst, ExtractValueInst>( + V); + }) && + "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast<Instruction>(VL[I]); + auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) + if (isa<UndefValue>(EE->getIndexOperand())) + continue; Optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } @@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { AllConsecutive = true; @@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } if (AllConsecutive) continue; @@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap<Value *, int> ExtractVectorsTys; + SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { if (isa<UndefValue>(V)) continue; @@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); Optional<unsigned> EEIdx = getExtractIndex(EE); @@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert(VecTy == FinalVecTy && + "No reused scalars expected for broadcast."); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } @@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, FoundOr = true; } // Check if the input is an extended load of the required or/shift expression. - Value *LoadPtr; + Value *Load; if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || - !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) return false; // Require that the total load bit width is a legal integer type. // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. - Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + Type *SrcTy = Load->getType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; @@ -9061,8 +9253,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } }; @@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +/// Compare two cmp instructions. If IsCompatibility is true, function returns +/// true if 2 cmps have same/swapped predicates and mos compatible corresponding +/// operands. If IsCompatibility is false, function implements strict weak +/// ordering relation between two cmp instructions, returning true if the first +/// instruction is "less" than the second, i.e. its predicate is less than the +/// predicate of the second or the operands IDs are less than the operands IDs +/// of the second cmp instruction. 
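The compareCmp template defined next plays two roles: instantiated with IsCompatibility=false it is a strict weak ordering usable for sorting, and with IsCompatibility=true it answers whether two compares may be grouped. A standalone sketch of that dual-role comparator pattern is below; Item and its fields are invented for the illustration and the real function compares types, predicates, and operands instead.

#include <algorithm>
#include <vector>

struct Item { int TypeId; int Pred; };

template <bool IsCompatibility>
static bool compareItems(const Item &A, const Item &B) {
  if (A.TypeId < B.TypeId)
    return !IsCompatibility; // "less" when ordering, "not compatible" when grouping
  if (A.TypeId > B.TypeId)
    return false;
  if (A.Pred < B.Pred)
    return !IsCompatibility;
  if (A.Pred > B.Pred)
    return false;
  return IsCompatibility;    // equal keys: not "less", but compatible
}

static void sortAndGroup(std::vector<Item> &Items) {
  std::stable_sort(Items.begin(), Items.end(),
                   compareItems</*IsCompatibility=*/false>);
  // After sorting, compatible items are adjacent; walk runs where
  // compareItems<true>(Items[I - 1], Items[I]) holds to form candidate groups,
  // mirroring how CompareSorter and AreCompatibleCompares are used further down.
}
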
+template <bool IsCompatibility> +static bool compareCmp(Value *V, Value *V2, + function_ref<bool(Instruction *)> IsDeleted) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return !IsCompatibility; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + CmpInst::Predicate Pred1 = CI1->getPredicate(); + CmpInst::Predicate Pred2 = CI2->getPredicate(); + CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); + CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); + CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); + CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); + if (BasePred1 < BasePred2) + return !IsCompatibility; + if (BasePred1 > BasePred2) + return false; + // Compare operands. + bool LEPreds = Pred1 <= Pred2; + bool GEPreds = Pred1 >= Pred2; + for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { + auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + if (Op1->getValueID() < Op2->getValueID()) + return !IsCompatibility; + if (Op1->getValueID() > Op2->getValueID()) + return false; + if (auto *I1 = dyn_cast<Instruction>(Op1)) + if (auto *I2 = dyn_cast<Instruction>(Op2)) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + } + return IsCompatibility; +} + bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, bool AtTerminator) { @@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. - // TODO: Add analysis on the operand opcodes (profitable to vectorize - // instructions with same/alternate opcodes/const values). 
auto &&CompareSorter = [&R](Value *V, Value *V2) { - auto *CI1 = cast<CmpInst>(V); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType()->getTypeID() < - CI2->getOperand(0)->getType()->getTypeID()) - return true; - if (CI1->getOperand(0)->getType()->getTypeID() > - CI2->getOperand(0)->getType()->getTypeID()) - return false; - return CI1->getPredicate() < CI2->getPredicate() || - (CI1->getPredicate() > CI2->getPredicate() && - CI1->getPredicate() < - CmpInst::getSwappedPredicate(CI2->getPredicate())); + return compareCmp<false>(V, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { if (V1 == V2) return true; - auto *CI1 = cast<CmpInst>(V1); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) - return false; - return CI1->getPredicate() == CI2->getPredicate() || - CI1->getPredicate() == - CmpInst::getSwappedPredicate(CI2->getPredicate()); + return compareCmp<true>(V1, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { unsigned EltSize = R.getVectorElementSize(V); @@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; + Optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. - if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = + !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); continue; + } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); @@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); continue; + } if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return false; + return ConstOrder && *ConstOrder; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 44b5e1df0839..1d9e71663cd2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { assert((SplitAt == end() || SplitAt->getParent() == this) && "can only split at a position in the same block"); - SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(), - getSuccessors().end()); + SmallVector<VPBlockBase *, 2> Succs(successors()); // First, disconnect the current block from its successors. 
for (VPBlockBase *Succ : Succs) VPBlockUtils::disconnectBlocks(this, Succ); @@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); @@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << " "; Operand->printAsOperand(O, SlotTracker); } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } } #endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 810dd5030f95..f4a1883e35d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -39,6 +39,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" #include <algorithm> @@ -51,6 +52,7 @@ namespace llvm { class BasicBlock; class DominatorTree; +class InductionDescriptor; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; @@ -500,6 +502,8 @@ public: const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } + iterator_range<VPBlockBase **> successors() { return Successors; } + const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -795,6 +799,7 @@ private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; FastMathFlags FMF; + DebugLoc DL; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -804,12 +809,14 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), + DL(DL) {} - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}) + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +825,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands); + return new VPInstruction(Opcode, Operands, DL); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1003,21 +1010,22 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; + const InductionDescriptor &IndDesc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} - if (Cast) - new VPValue(Cast, this); - } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1038,13 +1046,6 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1053,6 +1054,9 @@ public: const TruncInst *getTruncInst() const { return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } }; /// A recipe for handling first order recurrences and pointer inductions. For @@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// operand. class VPReductionPHIRecipe : public VPWidenPHIRecipe { /// Descriptor for the reduction. - RecurrenceDescriptor &RdxDesc; + const RecurrenceDescriptor &RdxDesc; /// The phi is part of an in-loop reduction. bool IsInLoop; @@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe { public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. - VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), @@ -1210,7 +1214,9 @@ public: VPSlotTracker &SlotTracker) const override; #endif - RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } @@ -1340,13 +1346,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. 
- RecurrenceDescriptor *RdxDesc; + const RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { @@ -2252,6 +2258,12 @@ public: return map_range(Operands, Fn); } + /// Returns true if \p VPV is uniform after vectorization. + bool isUniformAfterVectorization(VPValue *VPV) const { + auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef()); + return !VPV->getDef() || (RepR && RepR->isUniform()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2340,18 +2352,23 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. - BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector<VPBlockBase *> Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2394,6 +2411,31 @@ public: To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. 
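A generic sketch of the merge that the VPlan utility below performs, using toy data structures rather than VPBlockBase/VPBasicBlock: when a block has a single predecessor and that predecessor has a single successor, the block's contents are appended to the predecessor and the block is spliced out of the CFG. The real implementation additionally moves recipes via moveBefore, keeps the enclosing region's exit pointer up to date, and rewires edges through VPBlockUtils.

#include <algorithm>
#include <vector>

struct Block {
  std::vector<int> Instructions;       // stand-in for the recipe list
  std::vector<Block *> Preds, Succs;
};

static Block *tryToMergeIntoPredecessor(Block *B) {
  if (B->Preds.size() != 1)
    return nullptr;
  Block *Pred = B->Preds.front();
  if (Pred->Succs.size() != 1)
    return nullptr;

  // Move the contents into the predecessor.
  Pred->Instructions.insert(Pred->Instructions.end(), B->Instructions.begin(),
                            B->Instructions.end());
  // Rewire: Pred inherits B's successors, B disappears from the CFG.
  Pred->Succs = B->Succs;
  for (Block *Succ : B->Succs)
    std::replace(Succ->Preds.begin(), Succ->Preds.end(), B, Pred);
  delete B; // assumes heap-allocated blocks, as in the sketch's usage
  return Pred;
}
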
+ static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) { + auto *VPBB = dyn_cast<VPBasicBlock>(Block); + auto *PredVPBB = + dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor()); + if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1) + return nullptr; + + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + R.moveBefore(*PredVPBB, PredVPBB->end()); + VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); + auto *ParentRegion = cast<VPRegionBlock>(Block->getParent()); + if (ParentRegion->getExit() == Block) + ParentRegion->setExit(PredVPBB); + SmallVector<VPBlockBase *> Successors(Block->successors()); + for (auto *Succ : Successors) { + VPBlockUtils::disconnectBlocks(Block, Succ); + VPBlockUtils::connectBlocks(PredVPBB, Succ); + } + delete Block; + return PredVPBB; + } + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. static bool isBackEdge(const VPBlockBase *FromBlock, const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc34..86ecd6817873 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, case EdgeType::FALSE_EDGE: // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV); + IntermediateVal = Builder.createNot(CBV, {}); break; } // Now AND intermediate value with PredBB's block predicate if it has one. VPValue *BP = PredBB->getPredicate(); if (BP) - return Builder.createAnd(BP, IntermediateVal); + return Builder.createAnd(BP, IntermediateVal, {}); else return IntermediateVal; } @@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { Worklist.pop_front(); // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS); + VPValue *Or = Builder.createOr(LHS, RHS, {}); // Push OR to the back of the worklist. 
Worklist.push_back(Or); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index c52c8a2229e8..9e19e172dea5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { return markFailed(); assert(CombinedOperands.size() > 0 && "Need more some operands"); - auto *VPI = new VPInstruction(Opcode, CombinedOperands); - VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); + auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); + auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + VPI->setUnderlyingInstr(Inst); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << *cast<VPInstruction>(Values[0]) << "\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ded5bc04beb5..d2daf558c2c5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -18,7 +18,8 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) { auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); @@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); - InductionDescriptor II = Inductions.lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { + VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. 
Clone->insertBefore(SinkCandidate); - SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(), - SinkCandidate->user_end()); + SmallVector<VPUser *, 4> Users(SinkCandidate->users()); for (auto *U : Users) { auto *UI = cast<VPRecipeBase>(U); if (UI->getParent() == SinkTo) @@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(), - Phi1ToMoveV->user_end()); + SmallVector<VPUser *> Users(Phi1ToMoveV->users()); for (VPUser *U : Users) { auto *UI = dyn_cast<VPRecipeBase>(U); if (!UI || UI->getParent() != Then2) @@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { delete ToDelete; return Changed; } + +void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { + SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove; + for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + if (!IV || IV->getTruncInst()) + continue; + + // Visit all casts connected to IV and in Casts. Collect them. + // remember them for removal. + auto &Casts = IV->getInductionDescriptor().getCastInsts(); + VPValue *FindMyCast = IV; + for (Instruction *IRCast : reverse(Casts)) { + VPRecipeBase *FoundUserCast = nullptr; + for (auto *U : FindMyCast->users()) { + auto *UserCast = cast<VPRecipeBase>(U); + if (UserCast->getNumDefinedValues() == 1 && + UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) { + FoundUserCast = UserCast; + break; + } + } + assert(FoundUserCast && "Missing a cast to remove"); + CastsToRemove.emplace_back(FoundUserCast, IV); + FindMyCast = FoundUserCast->getVPSingleValue(); + } + } + for (auto &E : CastsToRemove) { + E.first->getVPSingleValue()->replaceAllUsesWith(E.second); + E.first->eraseFromParent(); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index c740f2c022da..a82a562d5e35 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,24 +14,37 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { +class InductionDescriptor; class Instruction; +class PHINode; class ScalarEvolution; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. - static void VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE); + static void + VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + ScalarEvolution &SE); static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Remove redundant casts of inductions. + /// + /// Such redundant casts are casts of induction variables that can be ignored, + /// because we already proved that the casted phi is equal to the uncasted phi + /// in the vectorized loop. 
There is no need to vectorize the cast - the same + /// value can be used for both the phi and casts in the vector loop. + static void removeRedundantInductionCasts(VPlan &Plan); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6d6ea4eb30f1..7732d9367985 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { RecipeI++; } } + + const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry()); + const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry()); + if (!Entry) { + errs() << "VPlan entry block is not a VPBasicBlock\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); + if (!Exit) { + errs() << "VPlan exit block is not a VPBasicBlock\n"; + return false; + } + + for (const VPRegionBlock *Region : + VPBlockUtils::blocksOnly<const VPRegionBlock>( + depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( + Plan.getEntry())))) { + if (Region->getEntry()->getNumPredecessors() != 0) { + errs() << "region entry block has predecessors\n"; + return false; + } + if (Region->getExit()->getNumSuccessors() != 0) { + errs() << "region exit block has successors\n"; + return false; + } + } return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 57b11e9414ba..c0aedab2fed0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!FixedVT) return false; - InstructionCost OriginalCost = TTI.getMemoryOpCost( - Instruction::Load, LI->getType(), Align(LI->getAlignment()), - LI->getPointerAddressSpace()); + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; |
