Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  161
1 file changed, 65 insertions(+), 96 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d11f4146b590..3290439ecd07 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -632,13 +632,6 @@ protected:
                                        Instruction *EntryVal, VPValue *Def,
                                        VPTransformState &State);

-  /// Returns true if an instruction \p I should be scalarized instead of
-  /// vectorized for the chosen vectorization factor.
-  bool shouldScalarizeInstruction(Instruction *I) const;
-
-  /// Returns true if we should generate a scalar version of \p IV.
-  bool needsScalarInduction(Instruction *IV) const;
-
   /// Returns (and creates if needed) the original loop trip count.
   Value *getOrCreateTripCount(Loop *NewLoop);

@@ -2479,21 +2472,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }

-bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
-  return Cost->isScalarAfterVectorization(I, VF) ||
-         Cost->isProfitableToScalarize(I, VF);
-}
-
-bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
-  if (shouldScalarizeInstruction(IV))
-    return true;
-  auto isScalarInst = [&](User *U) -> bool {
-    auto *I = cast<Instruction>(U);
-    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
-  };
-  return llvm::any_of(IV->users(), isScalarInst);
-}
-
 void InnerLoopVectorizer::widenIntOrFpInduction(
     PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
     Value *CanonicalIV) {
@@ -2549,27 +2527,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
     return ScalarIV;
   };

-  // Create the vector values from the scalar IV, in the absence of creating a
-  // vector IV.
-  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
-    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *StartIdx;
-      if (Step->getType()->isFloatingPointTy())
-        StartIdx =
-            getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
-      else
-        StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
-
-      Value *EntryPart =
-          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
-                        State.VF, State.Builder);
-      State.set(Def, EntryPart, Part);
-      if (Trunc)
-        addMetadata(EntryPart, Trunc);
-    }
-  };
-
   // Fast-math-flags propagate from the original induction instruction.
   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
@@ -2605,36 +2562,18 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
     return;
   }

-  // Determine if we want a scalar version of the induction variable. This is
-  // true if the induction variable itself is not widened, or if it has at
-  // least one user in the loop that is not widened.
-  auto NeedsScalarIV = needsScalarInduction(EntryVal);
-  if (!NeedsScalarIV) {
+  // Create a new independent vector induction variable, if one is needed.
+  if (Def->needsVectorIV())
     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-    return;
-  }

-  // Try to create a new independent vector induction variable. If we can't
-  // create the phi node, we will splat the scalar induction variable in each
-  // loop iteration.
-  if (!shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-    Value *ScalarIV = CreateScalarIV(Step);
+  if (Def->needsScalarIV()) {
     // Create scalar steps that can be used by instructions we will later
     // scalarize. Note that the addition of the scalar steps will not increase
     // the number of instructions in the loop in the common case prior to
     // InstCombine. We will be trading one vector extract for each scalar step.
+    Value *ScalarIV = CreateScalarIV(Step);
     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
-    return;
   }
-
-  // All IV users are scalar instructions, so only emit a scalar IV, not a
-  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
-  // predicate used by the masked loads/stores.
-  Value *ScalarIV = CreateScalarIV(Step);
-  if (!Cost->isScalarEpilogueAllowed())
-    CreateSplatIV(ScalarIV, Step);
-  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
 }

 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@@ -2663,17 +2602,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   }

   // Determine the number of scalars we need to generate for each unroll
-  // iteration. If EntryVal is uniform, we only need to generate the first
-  // lane. Otherwise, we generate all VF values.
-  bool IsUniform =
-      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
-  unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+  // iteration.
+  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
+  unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();

   // Compute the scalar steps and save the results in State.
   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                      ScalarIVTy->getScalarSizeInBits());
   Type *VecIVTy = nullptr;
   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
-  if (!IsUniform && State.VF.isScalable()) {
+  if (!FirstLaneOnly && State.VF.isScalable()) {
     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
     UnitStepVec =
         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
@@ -2684,7 +2621,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

-    if (!IsUniform && State.VF.isScalable()) {
+    if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (ScalarIVTy->isFloatingPointTy())
@@ -4565,7 +4502,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
     // Determine the number of scalars we need to generate for each unroll
     // iteration. If the instruction is uniform, we only need to generate the
     // first lane. Otherwise, we generate all VF values.
-    bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
+    bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
     assert((IsUniform || !State.VF.isScalable()) &&
            "Cannot scalarize a scalable VF");
     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
@@ -5889,7 +5826,9 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   // consider interleaving beneficial (eg. MVE).
   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
     return false;
-  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+  // FIXME: We should consider changing the threshold for scalable
+  // vectors to take VScaleForTuning into account.
+  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
     return true;
   return false;
 }
@@ -5940,29 +5879,21 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
     return Result;
   }

-  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
-  if (MainLoopVF.isScalable())
-    LLVM_DEBUG(
-        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
-                  "yet supported. Converting to fixed-width (VF="
-               << FixedMainLoopVF << ") instead\n");
-
-  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                          "this loop\n");
     return Result;
   }

   for (auto &NextVF : ProfitableVFs)
-    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
-        (Result.Width.getFixedValue() == 1 ||
-         isMoreProfitable(NextVF, Result)) &&
+    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+        (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
         LVP.hasPlanWithVF(NextVF.Width))
       Result = NextVF;

   if (Result != VectorizationFactor::Disabled())
     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
-                      << Result.Width.getFixedValue() << "\n";);
+                      << Result.Width << "\n";);
   return Result;
 }

@@ -8546,16 +8477,54 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                 Mask, Consecutive, Reverse);
 }

-VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
-                                           ArrayRef<VPValue *> Operands) const {
+static VPWidenIntOrFpInductionRecipe *
+createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
+                           VPValue *Start, const InductionDescriptor &IndDesc,
+                           LoopVectorizationCostModel &CM, Loop &OrigLoop,
+                           VFRange &Range) {
+  // Returns true if an instruction \p I should be scalarized instead of
+  // vectorized for the chosen vectorization factor.
+  auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
+    return CM.isScalarAfterVectorization(I, VF) ||
+           CM.isProfitableToScalarize(I, VF);
+  };
+
+  bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) {
+        // Returns true if we should generate a scalar version of \p IV.
+        if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
+          return true;
+        auto isScalarInst = [&](User *U) -> bool {
+          auto *I = cast<Instruction>(U);
+          return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
+        };
+        return any_of(PhiOrTrunc->users(), isScalarInst);
+      },
+      Range);
+  bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) {
+        return ShouldScalarizeInstruction(PhiOrTrunc, VF);
+      },
+      Range);
+  assert(IndDesc.getStartValue() ==
+         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
+    return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
+                                             NeedsScalarIV, !NeedsScalarIVOnly);
+  }
+  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
+  return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+                                           !NeedsScalarIVOnly);
+}
+
+VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
+    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
+
   // Check if this is an integer or fp induction. If so, build the recipe that
   // produces its scalar and vector values.
-  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
-    assert(II->getStartValue() ==
-           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
-    return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
-  }
+  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
+    return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
+                                      Range);

   return nullptr;
 }
@@ -8583,7 +8552,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
     auto *Phi = cast<PHINode>(I->getOperand(0));
     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
-    return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
+    return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
   }
   return nullptr;
 }
@@ -8865,7 +8834,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (auto Phi = dyn_cast<PHINode>(Instr)) {
     if (Phi->getParent() != OrigLoop->getHeader())
       return tryToBlend(Phi, Operands, Plan);
-    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
+    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
       return toVPRecipeResult(Recipe);

     VPHeaderPHIRecipe *PhiRecipe = nullptr;
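The heart of this change is that the scalar-vs-vector IV decision is now made once per VF range while building the VPlan recipe, via LoopVectorizationPlanner::getDecisionAndClampRange, instead of being re-queried from the cost model during code generation. The standalone sketch below models how that helper behaves: it takes the decision at the start of a VF range and shrinks the range's end so the decision is uniform for whatever remains; VFs that would decide differently fall into a later range with its own plan. This is a simplified illustration only — plain ints stand in for llvm::ElementCount, the surrounding LLVM types are omitted, and NeedsScalarIVForVF is a made-up predicate, not the real cost-model query.

#include <cassert>
#include <functional>
#include <iostream>

// Simplified stand-in for LLVM's VFRange: a half-open interval [Start, End)
// of candidate vectorization factors, restricted to powers of two.
struct VFRange {
  int Start;
  int End;
};

// Mirrors the semantics of LoopVectorizationPlanner::getDecisionAndClampRange:
// evaluate the predicate at Range.Start, then clamp Range.End at the first VF
// whose decision differs, so one recipe (with one set of flags, such as
// NeedsScalarIV) is valid for every VF left in the range.
static bool getDecisionAndClampRange(const std::function<bool(int)> &Predicate,
                                     VFRange &Range) {
  assert(Range.Start < Range.End && "trying to test an empty VF range");
  bool PredicateAtRangeStart = Predicate(Range.Start);
  for (int VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != PredicateAtRangeStart) {
      Range.End = VF; // Decisions diverge here; later VFs get a new plan.
      break;
    }
  return PredicateAtRangeStart;
}

int main() {
  // Hypothetical cost-model answer: scalarizing the IV pays off below VF 8.
  auto NeedsScalarIVForVF = [](int VF) { return VF < 8; };

  VFRange Range{2, 32};
  bool NeedsScalarIV = getDecisionAndClampRange(NeedsScalarIVForVF, Range);
  // Prints: NeedsScalarIV=1 for VFs [2, 8). VF 8..32 would start a new range.
  std::cout << "NeedsScalarIV=" << NeedsScalarIV << " for VFs [" << Range.Start
            << ", " << Range.End << ")\n";
}

In the patch proper, the two flags computed this way (NeedsScalarIV and !NeedsScalarIVOnly) are stored on the VPWidenIntOrFpInductionRecipe, which is what lets widenIntOrFpInduction shrink to simple queries of Def->needsVectorIV() and Def->needsScalarIV().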