| field | value | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-24 15:11:41 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-08 19:04:38 +0000 |
| commit | fcaf7f8644a9988098ac6be2165bce3ea4786e91 | |
| tree | 08a554363df16b968a623d651c09d82a5a0b1c65 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | |
| parent | 753f127f3ace09432b2baeffd71a308760641a62 | |
| parent | 4b4fe385e49bd883fd183b5f21c1ea486c722e61 | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
| mode | path | changes |
|---|---|---|
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 245 |

1 file changed, 99 insertions, 146 deletions
```diff
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0777a1385916..b887ea41676b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,6 +92,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
@@ -473,7 +474,7 @@ public:
   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
 
   /// Widen a single call instruction within the innermost loop.
-  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
+  void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
                             VPTransformState &State);
 
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
@@ -1447,15 +1448,14 @@ public:
   // through scalar predication or masked load/store or masked gather/scatter.
   // \p VF is the vectorization factor that will be used to vectorize \p I.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I, ElementCount VF,
-                        bool IsKnownUniform = false) {
-    // When we know the load is uniform and the original scalar loop was not
-    // predicated we don't need to mark it as a predicated instruction. Any
-    // vectorised blocks created when tail-folding are something artificial we
-    // have introduced and we know there is always at least one active lane.
-    // That's why we call Legal->blockNeedsPredication here because it doesn't
-    // query tail-folding.
-    if (IsKnownUniform && isa<LoadInst>(I) &&
+  bool isPredicatedInst(Instruction *I, ElementCount VF) {
+    // When we know the load's address is loop invariant and the instruction
+    // in the original scalar loop was unconditionally executed then we
+    // don't need to mark it as a predicated instruction. Tail folding may
+    // introduce additional predication, but we're guaranteed to always have
+    // at least one active lane. We call Legal->blockNeedsPredication here
+    // because it doesn't query tail-folding.
+    if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
         !Legal->blockNeedsPredication(I->getParent()))
       return false;
     if (!blockNeedsPredicationForAnyReason(I->getParent()))
@@ -1657,10 +1657,6 @@ private:
   InstructionCost getScalarizationOverhead(Instruction *I,
                                            ElementCount VF) const;
 
-  /// Returns whether the instruction is a load or store and will be a emitted
-  /// as a vector operation.
-  bool isConsecutiveLoadOrStore(Instruction *I);
-
   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
```
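The `isPredicatedInst` change above replaces a caller-supplied `IsKnownUniform` hint with a direct `Legal->isUniformMemOp` query: a load whose address is loop invariant and whose block ran unconditionally in the scalar loop needs no mask, because tail folding always leaves at least one active lane. As a minimal sketch of the kind of loop this targets (plain C++, not LLVM code; names are illustrative):

```cpp
#include <cstddef>

// The load of *scale is "uniform": its address does not depend on i, and it
// executes unconditionally in the scalar loop. Even when the vectorizer
// tail-folds this loop behind a mask, at least one lane is active in every
// vector iteration, so the scalar load can be emitted without predication.
void scale_all(float *out, const float *in, const float *scale, size_t n) {
  for (size_t i = 0; i < n; ++i)
    out[i] = *scale * in[i]; // uniform load feeding a widened multiply
}
```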
```diff
@@ -1919,10 +1915,13 @@ public:
     auto DiffChecks = RtPtrChecking.getDiffChecks();
     if (DiffChecks) {
+      Value *RuntimeVF = nullptr;
       MemRuntimeCheckCond = addDiffRuntimeChecks(
           MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
-          [VF](IRBuilderBase &B, unsigned Bits) {
-            return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+          [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
+            if (!RuntimeVF)
+              RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
+            return RuntimeVF;
           },
           IC);
     } else {
@@ -2947,11 +2946,17 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   // If tail is to be folded, vector loop takes care of all iterations.
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
-  auto CreateStep = [&]() {
+  auto CreateStep = [&]() -> Value * {
     // Create step with max(MinProTripCount, UF * VF).
-    if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
-      return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
-    return createStepForVF(Builder, CountTy, VF, UF);
+    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
+      return createStepForVF(Builder, CountTy, VF, UF);
+
+    Value *MinProfTC =
+        createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+    if (!VF.isScalable())
+      return MinProfTC;
+    return Builder.CreateBinaryIntrinsic(
+        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
   };
 
   if (!Cost->foldTailByMasking())
@@ -4168,46 +4173,26 @@ bool InnerLoopVectorizer::useOrderedReductions(
   return Cost->useOrderedReductions(RdxDesc);
 }
 
-/// A helper function for checking whether an integer division-related
-/// instruction may divide by zero (in which case it must be predicated if
-/// executed conditionally in the scalar code).
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
-/// Non-zero divisors that are non compile-time constants will not be
-/// converted into multiplication, so we will still end up scalarizing
-/// the division, but can do so w/o predication.
-static bool mayDivideByZero(Instruction &I) {
-  assert((I.getOpcode() == Instruction::UDiv ||
-          I.getOpcode() == Instruction::SDiv ||
-          I.getOpcode() == Instruction::URem ||
-          I.getOpcode() == Instruction::SRem) &&
-         "Unexpected instruction");
-  Value *Divisor = I.getOperand(1);
-  auto *CInt = dyn_cast<ConstantInt>(Divisor);
-  return !CInt || CInt->isZero();
-}
-
-void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
+void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
                                                VPUser &ArgOperands,
                                                VPTransformState &State) {
-  assert(!isa<DbgInfoIntrinsic>(I) &&
+  assert(!isa<DbgInfoIntrinsic>(CI) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
-  State.setDebugLocFromInst(&I);
-
-  Module *M = I.getParent()->getParent()->getParent();
-  auto *CI = cast<CallInst>(&I);
+  State.setDebugLocFromInst(&CI);
 
   SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->args())
+  for (Value *ArgOperand : CI.args())
     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
 
-  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
 
   // The flag shows whether we use Intrinsic or a usual Call for vectorized
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize = false;
-  InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
-  InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+  InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
+  InstructionCost IntrinsicCost =
+      ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
   assert((UseVectorIntrinsic || !NeedToScalarize) &&
          "Instruction should be scalarized elsewhere.");
```
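Two of the hunks above share a theme: values that are compile-time constants for fixed VFs become runtime computations once VF is scalable. The diff-check callback now memoizes `getRuntimeVF` so the `vscale`-based value is materialized only once, and `CreateStep` must take `max(MinProfitableTripCount, VF * UF)` with a runtime `umax`, since `VF * UF` is no longer a constant. A standalone sketch of the `CreateStep` decision (plain C++, not LLVM code; the constants and names are illustrative):

```cpp
#include <algorithm>
#include <cstdint>

constexpr uint64_t kUF = 2;               // unroll factor
constexpr uint64_t kVFKnownMin = 4;       // known-min lane count of the VF
constexpr uint64_t kMinProfitableTC = 16; // MinProfitableTripCount

// Mirrors the CreateStep lambda: for a fixed VF the comparison folds away at
// compile time; for a scalable VF the true element count is scaled by vscale
// and only known at run time, so the maximum needs a runtime umax.
uint64_t createStep(bool scalableVF, uint64_t vscale /* runtime value */) {
  uint64_t vfTimesUF = kUF * kVFKnownMin * (scalableVF ? vscale : 1);
  if (kUF * kVFKnownMin >= kMinProfitableTC)
    return vfTimesUF;        // step = VF * UF, minimum already satisfied
  if (!scalableVF)
    return kMinProfitableTC; // fixed VF: the constant bound wins
  return std::max(kMinProfitableTC, vfTimesUF); // llvm.umax equivalent
}
```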
```diff
@@ -4215,7 +4200,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
          "Either the intrinsic cost or vector call cost must be valid");
 
   for (unsigned Part = 0; Part < UF; ++Part) {
-    SmallVector<Type *, 2> TysForDecl = {CI->getType()};
+    SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
     for (auto &I : enumerate(ArgOperands.operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
@@ -4235,27 +4220,28 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       if (VF.isVector())
-        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+        TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
+      Module *M = State.Builder.GetInsertBlock()->getModule();
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
       // Use vector version of the function call.
-      const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+      const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
 #ifndef NDEBUG
-      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+      assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
              "Can't create vector function.");
 #endif
-      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
     }
     SmallVector<OperandBundleDef, 1> OpBundles;
-    CI->getOperandBundlesAsDefs(OpBundles);
+    CI.getOperandBundlesAsDefs(OpBundles);
     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
 
     if (isa<FPMathOperator>(V))
-      V->copyFastMathFlags(CI);
+      V->copyFastMathFlags(&CI);
 
     State.set(Def, V, Part);
-    State.addMetadata(V, &I);
+    State.addMetadata(V, &CI);
   }
 }
 
@@ -4470,7 +4456,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    return mayDivideByZero(*I);
+    // TODO: We can use the loop-preheader as context point here and get
+    // context sensitive reasoning
+    return !isSafeToSpeculativelyExecute(I);
   }
   return false;
 }
@@ -5406,7 +5394,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
   }
 
   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
-                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
+                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
              << "LV: Vectorization seems to be not beneficial, "
              << "but was forced by a user.\n");
   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@@ -6069,7 +6057,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+  assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+         "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate);
 }
```
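Replacing the ad-hoc `mayDivideByZero` helper with `!isSafeToSpeculativelyExecute(I)` reuses the generic ValueTracking query (hence the new `ValueTracking.h` include in the first hunk): a division stays predicated unless it provably cannot trap. A standalone sketch of the property being protected (plain C++, not LLVM code):

```cpp
#include <limits>

// A conditional signed division must stay predicated: executing it for
// inactive lanes would evaluate n / d even when the guard is false, which is
// undefined behavior for d == 0 and for INT_MIN / -1. A provably safe
// divisor (e.g. a non-zero constant) is what lets the speculation query say
// yes and drop the predication.
int guarded_div(int n, int d) {
  int q = 0;
  if (d != 0 && !(n == std::numeric_limits<int>::min() && d == -1))
    q = n / d; // only legal to execute when the guard holds
  return q;
}
```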
```diff
@@ -6779,19 +6768,29 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         NumPredStores++;
 
       if (Legal->isUniformMemOp(I)) {
-        // TODO: Avoid replicating loads and stores instead of
-        // relying on instcombine to remove them.
+        // Lowering story for uniform memory ops is currently a bit complicated.
+        // Scalarization works for everything which isn't a store with scalable
+        // VF.  Fixed len VFs just scalarize and then DCE later; scalarization
+        // knows how to handle uniform-per-part values (i.e. the first lane
+        // in each unrolled VF) and can thus handle scalable loads too.  For
+        // scalable stores, we use a scatter if legal.  If not, we have no way
+        // to lower (currently) and thus have to abort vectorization.
+        if (isa<StoreInst>(&I) && VF.isScalable()) {
+          if (isLegalGatherOrScatter(&I, VF))
+            setWideningDecision(&I, VF, CM_GatherScatter,
+                                getGatherScatterCost(&I, VF));
+          else
+            // Error case, abort vectorization
+            setWideningDecision(&I, VF, CM_Scalarize,
+                                InstructionCost::getInvalid());
+          continue;
+        }
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        InstructionCost Cost;
-        if (isa<StoreInst>(&I) && VF.isScalable() &&
-            isLegalGatherOrScatter(&I, VF)) {
-          Cost = getGatherScatterCost(&I, VF);
-          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
-        } else {
-          Cost = getUniformMemOpCost(&I, VF);
-          setWideningDecision(&I, VF, CM_Scalarize, Cost);
-        }
+        // TODO: Avoid replicating loads and stores instead of relying on
+        // instcombine to remove them.
+        setWideningDecision(&I, VF, CM_Scalarize,
+                            getUniformMemOpCost(&I, VF));
         continue;
       }
 
@@ -7146,13 +7145,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       InstWidening Decision = getWideningDecision(I, Width);
       assert(Decision != CM_Unknown &&
              "CM decision should be taken at this point");
-      if (Decision == CM_Scalarize) {
-        if (VF.isScalable() && isa<StoreInst>(I))
-          // We can't scalarize a scalable vector store (even a uniform one
-          // currently), return an invalid cost so as to prevent vectorization.
-          return InstructionCost::getInvalid();
+      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+        return InstructionCost::getInvalid();
+      if (Decision == CM_Scalarize)
         Width = ElementCount::getFixed(1);
-      }
     }
     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
     return getMemoryInstructionCost(I, VF);
@@ -7308,14 +7304,6 @@ Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
 }
 
 } // end namespace llvm
 
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
-  // Check if the pointer operand of a load or store instruction is
-  // consecutive.
-  if (auto *Ptr = getLoadStorePointerOperand(Inst))
-    return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
-  return false;
-}
-
 void LoopVectorizationCostModel::collectValuesToIgnore() {
   // Ignore ephemeral values.
   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
```
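The widening decision for uniform memory ops now centralizes the abort path: an invalid cost is recorded once in `setCostBasedWideningDecision`, and `getInstructionCost` merely propagates it instead of re-deriving the scalable-store special case. A standalone sketch of the resulting decision table (plain C++, not LLVM code; the enum and parameter names are illustrative):

```cpp
enum class Widening { Scalarize, GatherScatter, Abort };

// Mirrors the uniform-memory-op branch above: scalarization (scalar op plus
// broadcast/extract) covers loads and fixed-VF stores; a uniform store with
// a scalable VF cannot be scalarized lane by lane, so it becomes a scatter
// when the target supports one, and otherwise forces vectorization to be
// abandoned via an invalid cost.
Widening classifyUniformMemOp(bool isStore, bool scalableVF,
                              bool scatterIsLegal) {
  if (isStore && scalableVF)
    return scatterIsLegal ? Widening::GatherScatter : Widening::Abort;
  return Widening::Scalarize;
}
```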
```diff
@@ -8370,7 +8358,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
       Range);
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
       Range);
 
   // Even if the instruction is not marked as uniform, there are certain
@@ -8406,8 +8394,6 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
 
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
-  setRecipe(I, Recipe);
-  Plan->addVPValue(I, Recipe);
 
   // Find if I uses a predicated instruction. If so, it will use its scalar
   // value. Avoid hoisting the insert-element which packs the scalar value into
@@ -8426,6 +8412,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   // Finalize the recipe for Instr, first if it is not predicated.
   if (!IsPredicated) {
     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+    setRecipe(I, Recipe);
+    Plan->addVPValue(I, Recipe);
     VPBB->appendRecipe(Recipe);
     return VPBB;
   }
@@ -8436,7 +8424,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
          "predicated replication.");
   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
   // Record predicated instructions for above packing optimizations.
-  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+  VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
   VPBlockUtils::insertBlockAfter(Region, VPBB);
   auto *RegSucc = new VPBasicBlock();
   VPBlockUtils::insertBlockAfter(RegSucc, Region);
@@ -8444,11 +8432,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   return RegSucc;
 }
 
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
-    Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
+VPRegionBlock *
+VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
+                                       VPlanPtr &Plan) {
+  Instruction *Instr = PredRecipe->getUnderlyingInstr();
   // Instructions marked for predication are replicated and placed under an
   // if-then construct to prevent side-effects.
-
   // Generate recipes to compute the block mask for this region.
   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
 
@@ -8461,9 +8450,13 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
                                ? nullptr
                                : new VPPredInstPHIRecipe(PredRecipe);
   if (PHIRecipe) {
-    Plan->removeVPValueFor(Instr);
+    setRecipe(Instr, PHIRecipe);
     Plan->addVPValue(Instr, PHIRecipe);
+  } else {
+    setRecipe(Instr, PredRecipe);
+    Plan->addVPValue(Instr, PredRecipe);
   }
+
   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
@@ -9564,12 +9557,19 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     return;
   }
 
-  // Generate scalar instances for all VF lanes of all UF parts, unless the
-  // instruction is uniform inwhich case generate only the first lane for each
-  // of the UF parts.
-  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
-  assert((!State.VF.isScalable() || IsUniform) &&
-         "Can't scalarize a scalable vector");
+  if (IsUniform) {
+    // Uniform within VL means we need to generate lane 0 only for each
+    // unrolled copy.
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
+                                      VPIteration(Part, 0), IsPredicated,
+                                      State);
+    return;
+  }
+
+  // Generate scalar instances for all VF lanes of all UF parts.
+  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
+  const unsigned EndLane = State.VF.getKnownMinValue();
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
```
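Splitting the uniform case out of `VPReplicateRecipe::execute` makes the scalable-VF story explicit: a uniform recipe emits lane 0 of each unrolled part, which works regardless of `vscale`, while full per-lane replication requires a compile-time lane count. A standalone sketch of the two replication loops (plain C++, not LLVM code; `emitScalarCopy` stands in for `scalarizeInstruction`):

```cpp
#include <cassert>
#include <functional>

void replicate(bool isUniform, bool scalableVF, unsigned UF,
               unsigned vfKnownMin,
               const std::function<void(unsigned, unsigned)> &emitScalarCopy) {
  if (isUniform) {
    // Uniform within a vector: one scalar copy per unrolled part (lane 0),
    // valid even when the real lane count is only known at run time.
    for (unsigned part = 0; part < UF; ++part)
      emitScalarCopy(part, /*lane=*/0);
    return;
  }
  // Per-lane replication needs a fixed, compile-time lane count.
  assert(!scalableVF && "can't scalarize a scalable vector");
  for (unsigned part = 0; part < UF; ++part)
    for (unsigned lane = 0; lane < vfKnownMin; ++lane)
      emitScalarCopy(part, lane);
}
```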
```diff
@@ -9577,52 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
                                       VPIteration(Part, Lane), IsPredicated,
                                       State);
 }
 
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
-  assert(State.Instance && "Predicated instruction PHI works per instance.");
-  Instruction *ScalarPredInst =
-      cast<Instruction>(State.get(getOperand(0), *State.Instance));
-  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
-  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
-  assert(PredicatingBB && "Predicated block has no single predecessor.");
-  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
-         "operand must be VPReplicateRecipe");
-
-  // By current pack/unpack logic we need to generate only a single phi node: if
-  // a vector value for the predicated instruction exists at this point it means
-  // the instruction has vector users only, and a phi for the vector value is
-  // needed. In this case the recipe of the predicated instruction is marked to
-  // also do that packing, thereby "hoisting" the insert-element sequence.
-  // Otherwise, a phi node for the scalar value is needed.
-  unsigned Part = State.Instance->Part;
-  if (State.hasVectorValue(getOperand(0), Part)) {
-    Value *VectorValue = State.get(getOperand(0), Part);
-    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
-    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
-    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
-    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
-    if (State.hasVectorValue(this, Part))
-      State.reset(this, VPhi, Part);
-    else
-      State.set(this, VPhi, Part);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), VPhi, Part);
-  } else {
-    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
-    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
-    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
-                     PredicatingBB);
-    Phi->addIncoming(ScalarPredInst, PredicatedBB);
-    if (State.hasScalarValue(this, *State.Instance))
-      State.reset(this, Phi, *State.Instance);
-    else
-      State.set(this, Phi, *State.Instance);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), Phi, *State.Instance);
-  }
-}
-
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
 
@@ -9793,8 +9747,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
-  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
-                                       LVL.getLAI()))
+  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 
   return CM_ScalarEpilogueAllowed;
```
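The deleted `VPPredInstPHIRecipe::execute` implemented the pack/unpack phi that merges a conditionally computed scalar back into the surrounding vector; the recipe itself is still created in `createReplicateRegion` a few hunks up, so this hunk only removes the lowering from this file. A standalone sketch of that merge for one `(Part, Lane)` instance (plain C++, not LLVM code; all names are illustrative):

```cpp
#include <vector>

// For an active lane the predicated block runs and the phi picks the vector
// with the freshly inserted element; for an inactive lane it picks the
// unmodified incoming vector, mirroring VPhi's two incoming values.
std::vector<int> merge_predicated_lane(std::vector<int> vec, unsigned lane,
                                       bool maskBit, int predicatedResult) {
  if (maskBit)
    vec[lane] = predicatedResult; // insertelement in the predicated block
  return vec;                     // phi of {unmodified, updated} vector
}
```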
