Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp'):
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  188
1 file changed, 166 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cd48c0d57eb3..f923f0be6621 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -37,6 +37,11 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
+static cl::opt<bool>
+AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
+ cl::desc("Enable recognition of non-constant strided "
+ "pointer induction variables."));
+
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -447,8 +452,12 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
- const ValueToValueMap &Strides =
- getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+ // FIXME: Currently, the set of symbolic strides is sometimes queried before
+ // it's collected. This happens from canVectorizeWithIfConvert, when the
+ // pointer is checked to reference consecutive elements suitable for a
+ // masked access.
+ const auto &Strides =
+ LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();
Function *F = TheLoop->getHeader()->getParent();
bool OptForSize = F->hasOptSize() ||
@@ -462,11 +471,135 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
return 0;
}
-bool LoopVectorizationLegality::isUniform(Value *V) const {
- return LAI->isUniform(V);
+bool LoopVectorizationLegality::isInvariant(Value *V) const {
+ return LAI->isInvariant(V);
+}
+
+namespace {
+/// A rewriter to build the SCEVs for each of the VF lanes in the expected
+/// vectorized loop, which can then be compared to detect their uniformity. This
+/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
+/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
+/// Step is added to the start. Also checks if all sub-expressions are
+/// analyzable w.r.t. uniformity.
+class SCEVAddRecForUniformityRewriter
+ : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
+ /// Multiplier to be applied to the step of AddRecs in TheLoop.
+ unsigned StepMultiplier;
+
+ /// Offset to be added to the AddRecs in TheLoop.
+ unsigned Offset;
+
+ /// Loop for which to rewrite AddRecs.
+ Loop *TheLoop;
+
+ /// Is any sub-expression not analyzable w.r.t. uniformity?
+ bool CannotAnalyze = false;
+
+ bool canAnalyze() const { return !CannotAnalyze; }
+
+public:
+ SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
+ unsigned Offset, Loop *TheLoop)
+ : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
+ TheLoop(TheLoop) {}
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ assert(Expr->getLoop() == TheLoop &&
+ "addrec outside of TheLoop must be invariant and should have been "
+ "handled earlier");
+ // Build a new AddRec by multiplying the step by StepMultiplier and
+ // incrementing the start by Offset * step.
+ Type *Ty = Expr->getType();
+ auto *Step = Expr->getStepRecurrence(SE);
+ if (!SE.isLoopInvariant(Step, TheLoop)) {
+ CannotAnalyze = true;
+ return Expr;
+ }
+ auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
+ auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
+ auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
+ return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
+ }
+
+ const SCEV *visit(const SCEV *S) {
+ if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
+ return S;
+ return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
+ }
+
+ const SCEV *visitUnknown(const SCEVUnknown *S) {
+ if (SE.isLoopInvariant(S, TheLoop))
+ return S;
+ // The value could vary across iterations.
+ CannotAnalyze = true;
+ return S;
+ }
+
+ const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
+ // Could not analyze the expression.
+ CannotAnalyze = true;
+ return S;
+ }
+
+ static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
+ unsigned StepMultiplier, unsigned Offset,
+ Loop *TheLoop) {
+ /// Bail out if the expression does not contain a UDiv expression.
+ /// Uniform values which are not loop invariant require operations to strip
+ /// out the lowest bits. For now just look for UDivs and use this to avoid
+ /// rewriting UDiv-free expressions for other lanes, limiting compile time.
+ if (!SCEVExprContains(S,
+ [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
+ return SE.getCouldNotCompute();
+
+ SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
+ TheLoop);
+ const SCEV *Result = Rewriter.visit(S);
+
+ if (Rewriter.canAnalyze())
+ return Result;
+ return SE.getCouldNotCompute();
+ }
+};
+
+} // namespace
+
+bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
+ if (isInvariant(V))
+ return true;
+ if (VF.isScalable())
+ return false;
+ if (VF.isScalar())
+ return true;
+
+ // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
+ // never considered uniform.
+ auto *SE = PSE.getSE();
+ if (!SE->isSCEVable(V->getType()))
+ return false;
+ const SCEV *S = SE->getSCEV(V);
+
+ // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
+ // lane 0 matches the expressions for all other lanes.
+ unsigned FixedVF = VF.getKnownMinValue();
+ const SCEV *FirstLaneExpr =
+ SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
+ if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
+ return false;
+
+ // Make sure the expressions for lanes FixedVF-1..1 match the expression for
+ // lane 0. We check lanes in reverse order to improve compile time, as
+ // checking the last lane first is frequently sufficient to rule out uniformity.
+ return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
+ const SCEV *IthLaneExpr =
+ SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
+ return FirstLaneExpr == IthLaneExpr;
+ });
}
-bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
+bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
+ ElementCount VF) const {
Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
return false;
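
For illustration, the per-lane rewrite above can be sketched without the SCEV machinery: a scalar AddRec {Start,+,Step} becomes {Start + Lane*Step, +, VF*Step} for lane Lane, and isUniform then compares the resulting expressions across lanes. The following standalone C++ sketch (scalarIV and laneIV are made-up helper names, not LLVM API) checks that arithmetic numerically, assuming no overflow:

#include <cassert>
#include <cstdint>

// Value of the scalar IV {Start,+,Step} at scalar iteration N.
static uint64_t scalarIV(uint64_t Start, uint64_t Step, uint64_t N) {
  return Start + N * Step;
}

// Value seen by lane Lane in vector iteration VecIt once the AddRec has been
// rewritten to {Start + Lane*Step, +, VF*Step}, mirroring visitAddRecExpr.
static uint64_t laneIV(uint64_t Start, uint64_t Step, unsigned VF,
                       unsigned Lane, uint64_t VecIt) {
  return (Start + Lane * Step) + VecIt * (VF * Step);
}

int main() {
  const unsigned VF = 4;
  for (uint64_t VecIt = 0; VecIt < 8; ++VecIt)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      // The per-lane AddRec reproduces the scalar IV value of the
      // corresponding scalar iteration VF * VecIt + Lane.
      assert(laneIV(0, 1, VF, Lane, VecIt) ==
             scalarIV(0, 1, VF * VecIt + Lane));
      // An expression such as i / VF evaluates identically for all lanes of a
      // vector iteration; this is the kind of per-VF uniformity the
      // UDiv-guarded rewrite is meant to expose.
      assert(laneIV(0, 1, VF, Lane, VecIt) / VF ==
             laneIV(0, 1, VF, 0, VecIt) / VF);
    }
  return 0;
}
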
@@ -474,7 +607,7 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
// stores from being uniform. The current lowering simply doesn't handle
// it; in particular, the cost model distinguishes scatter/gather from
// scalar w/predication, and we currently rely on the scalar path.
- return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+ return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
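
Two source-level loops of the shape the query above is concerned with; the function names are illustrative, and whether the vectorizer actually classifies the accesses as uniform also depends on predication and the rest of the legality checks:

#include <cstddef>

// Loop-invariant address: every lane of every vector iteration reads the same
// location, so the load would be a uniform memory op for any VF (as long as
// its block does not need predication).
long sumInvariant(const long *p, std::size_t n) {
  long s = 0;
  for (std::size_t i = 0; i < n; ++i)
    s += *p;
  return s;
}

// Here the store address only changes every fourth iteration; with VF=4 all
// lanes of one vector iteration target the same element, the per-VF
// uniformity isUniform(Ptr, VF) is intended to recognize.
void accumulateGroups(long *out, const long *a, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    out[i / 4] += a[i];
}
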
@@ -700,6 +833,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
+ // We prevent matching non-constant strided pointer IVs to preserve
+ // historical vectorizer behavior after a generalization of the
+ // IVDescriptor code. The intent is to remove this check, but we
+ // have to fix issues around code quality for such loops first.
+ auto isDisallowedStridedPointerInduction =
+ [](const InductionDescriptor &ID) {
+ if (AllowStridedPointerIVs)
+ return false;
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ ID.getConstIntStepValue() == nullptr;
+ };
+
// TODO: Instead of recording the AllowedExit, it would be good to
// record the complementary set: NotAllowedExit. These include (but may
// not be limited to):
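
A loop of the shape the lambda above rejects by default; sumStrided is a made-up name, and whether a given front end and pass pipeline actually presents the pointer as an IK_PtrInduction with a non-constant step is not guaranteed:

#include <cstddef>

// The pointer p is an induction variable whose step is the runtime value
// stride rather than a compile-time constant, i.e. a non-constant strided
// pointer IV; recognition is gated behind -lv-strided-pointer-ivs.
long sumStrided(const long *p, std::size_t n, std::ptrdiff_t stride) {
  long s = 0;
  for (std::size_t i = 0; i < n; ++i, p += stride)
    s += *p;
  return s;
}
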
@@ -715,14 +860,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// By recording these, we can then reason about ways to vectorize each
// of these NotAllowedExit.
InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
+ !isDisallowedStridedPointerInduction(ID)) {
addInductionPhi(Phi, ID, AllowedExit);
Requirements->addExactFPMathInst(ID.getExactFPMathInst());
continue;
}
- if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
+ if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
AllowedExit.insert(Phi);
FixedOrderRecurrences.insert(Phi);
continue;
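
For reference, a fixed-order (first-order) recurrence at the source level; diffSum is a made-up name, and earlier passes may of course reshape the loop before it reaches the vectorizer:

#include <cstddef>

// prev carries the value computed in the previous iteration into the current
// one; the PHI that implements it is what isFixedOrderRecurrence classifies.
long diffSum(const long *a, std::size_t n) {
  long prev = 0, s = 0;
  for (std::size_t i = 0; i < n; ++i) {
    long cur = a[i];
    s += cur - prev;
    prev = cur;
  }
  return s;
}
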
@@ -730,7 +875,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// As a last resort, coerce the PHI to an AddRec expression
// and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
+ !isDisallowedStridedPointerInduction(ID)) {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
@@ -894,18 +1040,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
- // For fixed order recurrences, we use the previous value (incoming value from
- // the latch) to check if it dominates all users of the recurrence. Bail out
- // if we have to sink such an instruction for another recurrence, as the
- // dominance requirement may not hold after sinking.
- BasicBlock *LoopLatch = TheLoop->getLoopLatch();
- if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
- Instruction *V =
- cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
- return SinkAfter.find(V) != SinkAfter.end();
- }))
- return false;
-
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.
@@ -1124,6 +1258,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (isa<NoAliasScopeDeclInst>(&I))
continue;
+ // We can allow masked calls if there's at least one vector variant, even
+ // if we end up scalarizing due to the cost model calculations.
+ // TODO: Allow other calls if they have appropriate attributes... readonly
+ // and argmemonly?
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (VFDatabase::hasMaskedVariant(*CI)) {
+ MaskedOp.insert(CI);
+ continue;
+ }
+
// Loads are handled via masking (or speculated if safe to do so.)
if (auto *LI = dyn_cast<LoadInst>(&I)) {
if (!SafePtrs.count(LI->getPointerOperand()))
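
A sketch of the kind of conditional call this affects; conditionalSin is a made-up name, and the behaviour described in the comments only applies if the VFDatabase actually reports a masked vector variant for the callee:

#include <cmath>
#include <cstddef>

// The call only executes when the guard is true, so if-converting the loop
// requires either masking the call or scalarizing it with predication. When a
// masked vector variant of the callee is known (for example via a vector math
// library mapping recorded in the vector-function-abi-variant attribute), the
// check above allows the block to be predicated and records the call in
// MaskedOp; the cost model still decides what is actually emitted.
void conditionalSin(float *out, const float *in, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    if (in[i] > 0.0f)
      out[i] = std::sin(in[i]);
}
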