author     Dimitry Andric <dim@FreeBSD.org>    2022-07-24 15:11:41 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2023-02-08 19:04:38 +0000
commit     fcaf7f8644a9988098ac6be2165bce3ea4786e91 (patch)
tree       08a554363df16b968a623d651c09d82a5a0b1c65 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     753f127f3ace09432b2baeffd71a308760641a62 (diff)
parent     4b4fe385e49bd883fd183b5f21c1ea486c722e61 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 245
1 file changed, 99 insertions(+), 146 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0777a1385916..b887ea41676b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,6 +92,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -473,7 +474,7 @@ public:
virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
/// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
+ void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
@@ -1447,15 +1448,14 @@ public:
// through scalar predication or masked load/store or masked gather/scatter.
// \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, ElementCount VF,
- bool IsKnownUniform = false) {
- // When we know the load is uniform and the original scalar loop was not
- // predicated we don't need to mark it as a predicated instruction. Any
- // vectorised blocks created when tail-folding are something artificial we
- // have introduced and we know there is always at least one active lane.
- // That's why we call Legal->blockNeedsPredication here because it doesn't
- // query tail-folding.
- if (IsKnownUniform && isa<LoadInst>(I) &&
+ bool isPredicatedInst(Instruction *I, ElementCount VF) {
+ // When we know the load's address is loop invariant and the instruction
+ // in the original scalar loop was unconditionally executed then we
+ // don't need to mark it as a predicated instruction. Tail folding may
+ // introduce additional predication, but we're guaranteed to always have
+ // at least one active lane. We call Legal->blockNeedsPredication here
+ // because it doesn't query tail-folding.
+ if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
if (!blockNeedsPredicationForAnyReason(I->getParent()))
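
To make the new comment concrete, here is a hypothetical scalar loop of the kind the early-exit targets (example code, not part of the patch): the loaded address is loop invariant and the load executes on every iteration, so tail folding alone should not force it to be predicated.

// Hypothetical scalar loop: *scale is a uniform (loop-invariant) address and
// the load runs on every iteration. Even if the vectorizer tail-folds this
// loop, the load itself never needs a mask -- at least one lane is always
// active in any vector iteration.
int sum_scaled(const int *a, const int *scale, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i] * *scale; // uniform load, unconditional in the scalar loop
  return sum;
}
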
@@ -1657,10 +1657,6 @@ private:
InstructionCost getScalarizationOverhead(Instruction *I,
ElementCount VF) const;
- /// Returns whether the instruction is a load or store and will be a emitted
- /// as a vector operation.
- bool isConsecutiveLoadOrStore(Instruction *I);
-
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
@@ -1919,10 +1915,13 @@ public:
auto DiffChecks = RtPtrChecking.getDiffChecks();
if (DiffChecks) {
+ Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
- [VF](IRBuilderBase &B, unsigned Bits) {
- return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
+ if (!RuntimeVF)
+ RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ return RuntimeVF;
},
IC);
} else {
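
The new lambda caches the expanded runtime VF in the captured RuntimeVF pointer so the expansion is emitted only once, however many diff checks are generated. A minimal standalone analogue of that compute-once-in-a-callback pattern (illustrative names, not the LLVM API):

#include <functional>
#include <vector>

// Sketch: cache an expensive value in a captured local so repeated
// invocations of the callback reuse the first result, mirroring how
// RuntimeVF is expanded once and returned for every subsequent diff check.
int emitChecks(const std::vector<int> &Checks,
               const std::function<int()> &GetVF) {
  int Total = 0;
  for (int C : Checks)
    Total += C * GetVF(); // cheap after the first call
  return Total;
}

int runExample() {
  int CachedVF = 0; // 0 means "not computed yet"
  auto GetVF = [&CachedVF] {
    if (!CachedVF)
      CachedVF = 8; // stand-in for the expensive getRuntimeVF() expansion
    return CachedVF;
  };
  return emitChecks({1, 2, 3}, GetVF); // expands the VF exactly once
}
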
@@ -2947,11 +2946,17 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
- auto CreateStep = [&]() {
+ auto CreateStep = [&]() -> Value * {
// Create step with max(MinProTripCount, UF * VF).
- if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
- return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
- return createStepForVF(Builder, CountTy, VF, UF);
+ if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
+ return createStepForVF(Builder, CountTy, VF, UF);
+
+ Value *MinProfTC =
+ createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+ if (!VF.isScalable())
+ return MinProfTC;
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
};
if (!Cost->foldTailByMasking())
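
For a scalable VF the value of VF * UF is only known at run time, so CreateStep can no longer assume the compile-time comparison against MinProfitableTripCount settles the question; it now emits an llvm.umax of the two candidates. A fixed-width sketch of the intended semantics (plain C++, illustrative only):

#include <algorithm>
#include <cstdint>

// Sketch of what CreateStep now computes. For a fixed VF the comparison
// folds at compile time; for a scalable VF the equivalent of VF * UF is a
// runtime quantity, hence the llvm.umax intrinsic in the patch.
uint64_t createStep(uint64_t RuntimeVFxUF, uint64_t MinProfitableTripCount) {
  return std::max(RuntimeVFxUF, MinProfitableTripCount);
}
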
@@ -4168,46 +4173,26 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-/// A helper function for checking whether an integer division-related
-/// instruction may divide by zero (in which case it must be predicated if
-/// executed conditionally in the scalar code).
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
-/// Non-zero divisors that are non compile-time constants will not be
-/// converted into multiplication, so we will still end up scalarizing
-/// the division, but can do so w/o predication.
-static bool mayDivideByZero(Instruction &I) {
- assert((I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::URem ||
- I.getOpcode() == Instruction::SRem) &&
- "Unexpected instruction");
- Value *Divisor = I.getOperand(1);
- auto *CInt = dyn_cast<ConstantInt>(Divisor);
- return !CInt || CInt->isZero();
-}
-
-void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
+void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(I) &&
+ assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
+ State.setDebugLocFromInst(&CI);
SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->args())
+ for (Value *ArgOperand : CI.args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+ InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost =
+ ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
@@ -4215,7 +4200,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
"Either the intrinsic cost or vector call cost must be valid");
for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Type *, 2> TysForDecl = {CI->getType()};
+ SmallVector<Type *, 2> TysForDecl = {CI.getType()};
SmallVector<Value *, 4> Args;
for (auto &I : enumerate(ArgOperands.operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -4235,27 +4220,28 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.
if (VF.isVector())
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
+ Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
// Use vector version of the function call.
- const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+ const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
- assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.");
#endif
- VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
}
SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
+ CI.getOperandBundlesAsDefs(OpBundles);
CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
+ V->copyFastMathFlags(&CI);
State.set(Def, V, Part);
- State.addMetadata(V, &I);
+ State.addMetadata(V, &CI);
}
}
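
The selection between the intrinsic and the library-call branch above boils down to a cost comparison; a minimal sketch of the rule (illustrative, not the LLVM API):

// Sketch of the choice made in widenCallInstruction(): prefer the vector
// intrinsic whenever one exists and costs no more than the best vector
// library call.
bool useVectorIntrinsic(bool HasIntrinsicID, unsigned IntrinsicCost,
                        unsigned CallCost) {
  return HasIntrinsicID && IntrinsicCost <= CallCost;
}
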
@@ -4470,7 +4456,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- return mayDivideByZero(*I);
+ // TODO: We can use the loop-preheader as context point here and get
+ // context sensitive reasoning
+ return !isSafeToSpeculativelyExecute(I);
}
return false;
}
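
Replacing the bespoke mayDivideByZero() with !isSafeToSpeculativelyExecute() asks the more general question: can this instruction run unconditionally without introducing undefined behavior? A hypothetical loop where the answer is no, so the division must remain predicated:

// Hypothetical source loop: the division is guarded, so b[i] may be zero on
// iterations where the guard is false. Executing the sdiv unconditionally
// for all lanes could divide by zero, so when vectorized the division has
// to stay predicated (scalarized under its mask).
void guardedDiv(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = a[i] / b[i];
}
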
@@ -5406,7 +5394,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
}
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
- ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
+ !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@@ -6069,7 +6057,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+ assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+ "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6779,19 +6768,29 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
- // TODO: Avoid replicating loads and stores instead of
- // relying on instcombine to remove them.
+ // Lowering story for uniform memory ops is currently a bit complicated.
+ // Scalarization works for everything which isn't a store with scalable
+ // VF. Fixed len VFs just scalarize and then DCE later; scalarization
+ // knows how to handle uniform-per-part values (i.e. the first lane
+ // in each unrolled VF) and can thus handle scalable loads too. For
+ // scalable stores, we use a scatter if legal. If not, we have no way
+ // to lower (currently) and thus have to abort vectorization.
+ if (isa<StoreInst>(&I) && VF.isScalable()) {
+ if (isLegalGatherOrScatter(&I, VF))
+ setWideningDecision(&I, VF, CM_GatherScatter,
+ getGatherScatterCost(&I, VF));
+ else
+ // Error case, abort vectorization
+ setWideningDecision(&I, VF, CM_Scalarize,
+ InstructionCost::getInvalid());
+ continue;
+ }
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- InstructionCost Cost;
- if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I, VF)) {
- Cost = getGatherScatterCost(&I, VF);
- setWideningDecision(&I, VF, CM_GatherScatter, Cost);
- } else {
- Cost = getUniformMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Scalarize, Cost);
- }
+ // TODO: Avoid replicating loads and stores instead of relying on
+ // instcombine to remove them.
+ setWideningDecision(&I, VF, CM_Scalarize,
+ getUniformMemOpCost(&I, VF));
continue;
}
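
The restructured control flow above amounts to a small decision table; a sketch of the policy it implements (illustrative enum and names, not the LLVM API):

// Sketch of the widening policy for a memory op with a uniform
// (loop-invariant) address, as restructured in the patch:
//   scalable-VF store + legal scatter -> gather/scatter
//   scalable-VF store, no scatter     -> invalid cost, vectorization aborts
//   everything else                   -> scalarize (later cleaned up by DCE)
enum class Widening { GatherScatter, ScalarizeInvalid, Scalarize };

Widening classifyUniformMemOp(bool IsStore, bool ScalableVF,
                              bool LegalScatter) {
  if (IsStore && ScalableVF)
    return LegalScatter ? Widening::GatherScatter
                        : Widening::ScalarizeInvalid;
  return Widening::Scalarize;
}
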
@@ -7146,13 +7145,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
- if (Decision == CM_Scalarize) {
- if (VF.isScalable() && isa<StoreInst>(I))
- // We can't scalarize a scalable vector store (even a uniform one
- // currently), return an invalid cost so as to prevent vectorization.
- return InstructionCost::getInvalid();
+ if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+ return InstructionCost::getInvalid();
+ if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
- }
}
VectorTy = ToVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
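
The change above makes an invalid widening cost sticky: once the decision for an instruction was recorded as invalid (for example, the unscalarizable scalable-store case), the instruction-cost query reports invalid too, vetoing that vectorization factor. A minimal sketch of the propagation, using std::optional as a stand-in for InstructionCost's invalid state:

#include <optional>

// Sketch: std::nullopt plays the role of InstructionCost::getInvalid(). If
// the recorded widening cost is invalid, the memory-instruction cost query
// returns invalid as well, which rules out this vectorization factor.
using Cost = std::optional<unsigned>;

Cost memoryInstructionCost(Cost WideningCost, unsigned BaseCost) {
  if (!WideningCost)
    return std::nullopt; // propagate "cannot lower this"
  return *WideningCost + BaseCost;
}
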
@@ -7308,14 +7304,6 @@ Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
} // end namespace llvm
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
- // Check if the pointer operand of a load or store instruction is
- // consecutive.
- if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
- return false;
-}
-
void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
@@ -8370,7 +8358,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
Range);
// Even if the instruction is not marked as uniform, there are certain
@@ -8406,8 +8394,6 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
- setRecipe(I, Recipe);
- Plan->addVPValue(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
@@ -8426,6 +8412,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ setRecipe(I, Recipe);
+ Plan->addVPValue(I, Recipe);
VPBB->appendRecipe(Recipe);
return VPBB;
}
@@ -8436,7 +8424,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
"predicated replication.");
VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
// Record predicated instructions for above packing optimizations.
- VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
@@ -8444,11 +8432,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
return RegSucc;
}
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
- Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
+VPRegionBlock *
+VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
+ VPlanPtr &Plan) {
+ Instruction *Instr = PredRecipe->getUnderlyingInstr();
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
-
// Generate recipes to compute the block mask for this region.
VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
@@ -8461,9 +8450,13 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
? nullptr
: new VPPredInstPHIRecipe(PredRecipe);
if (PHIRecipe) {
- Plan->removeVPValueFor(Instr);
+ setRecipe(Instr, PHIRecipe);
Plan->addVPValue(Instr, PHIRecipe);
+ } else {
+ setRecipe(Instr, PredRecipe);
+ Plan->addVPValue(Instr, PredRecipe);
}
+
auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
@@ -9564,12 +9557,19 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
return;
}
- // Generate scalar instances for all VF lanes of all UF parts, unless the
- // instruction is uniform inwhich case generate only the first lane for each
- // of the UF parts.
- unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
- assert((!State.VF.isScalable() || IsUniform) &&
- "Can't scalarize a scalable vector");
+ if (IsUniform) {
+ // Uniform within VL means we need to generate lane 0 only for each
+ // unrolled copy.
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
+ VPIteration(Part, 0), IsPredicated,
+ State);
+ return;
+ }
+
+ // Generate scalar instances for all VF lanes of all UF parts.
+ assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
+ const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
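
The rewritten execute() above splits the uniform case out of the lane loop: a uniform replicate recipe needs only lane 0 of each unrolled part, which is what allows scalable VFs through, while the per-lane path still asserts on scalable VF. A fixed-size analogue of the two emission loops (illustrative, not the LLVM API):

#include <cstdio>

// Sketch of the two emission strategies in the rewritten execute(): uniform
// values get one scalar copy per unrolled part (lane 0 only); non-uniform
// values get a copy per lane, which is why that path cannot handle a
// scalable VF (the lane count is unknown at compile time).
void emitScalarCopies(bool IsUniform, unsigned UF, unsigned VF) {
  if (IsUniform) {
    for (unsigned Part = 0; Part < UF; ++Part)
      std::printf("emit copy for (Part=%u, Lane=0)\n", Part);
    return;
  }
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("emit copy for (Part=%u, Lane=%u)\n", Part, Lane);
}
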
@@ -9577,52 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State);
}
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Predicated instruction PHI works per instance.");
- Instruction *ScalarPredInst =
- cast<Instruction>(State.get(getOperand(0), *State.Instance));
- BasicBlock *PredicatedBB = ScalarPredInst->getParent();
- BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
- assert(PredicatingBB && "Predicated block has no single predecessor.");
- assert(isa<VPReplicateRecipe>(getOperand(0)) &&
- "operand must be VPReplicateRecipe");
-
- // By current pack/unpack logic we need to generate only a single phi node: if
- // a vector value for the predicated instruction exists at this point it means
- // the instruction has vector users only, and a phi for the vector value is
- // needed. In this case the recipe of the predicated instruction is marked to
- // also do that packing, thereby "hoisting" the insert-element sequence.
- // Otherwise, a phi node for the scalar value is needed.
- unsigned Part = State.Instance->Part;
- if (State.hasVectorValue(getOperand(0), Part)) {
- Value *VectorValue = State.get(getOperand(0), Part);
- InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
- PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
- VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
- VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
- if (State.hasVectorValue(this, Part))
- State.reset(this, VPhi, Part);
- else
- State.set(this, VPhi, Part);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), VPhi, Part);
- } else {
- Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
- PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
- PredicatingBB);
- Phi->addIncoming(ScalarPredInst, PredicatedBB);
- if (State.hasScalarValue(this, *State.Instance))
- State.reset(this, Phi, *State.Instance);
- else
- State.set(this, Phi, *State.Instance);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), Phi, *State.Instance);
- }
-}
-
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9793,8 +9747,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
- LVL.getLAI()))
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;