Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 188
1 file changed, 166 insertions, 22 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cd48c0d57eb3..f923f0be6621 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -37,6 +37,11 @@ static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
+static cl::opt<bool>
+AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
+                       cl::desc("Enable recognition of non-constant strided "
+                                "pointer induction variables."));
+
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -447,8 +452,12 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
 
 int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                 Value *Ptr) const {
-  const ValueToValueMap &Strides =
-      getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+  // FIXME: Currently, the set of symbolic strides is sometimes queried before
+  // it's collected. This happens from canVectorizeWithIfConvert, when the
+  // pointer is checked to reference consecutive elements suitable for a
+  // masked access.
+  const auto &Strides =
+      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();
 
   Function *F = TheLoop->getHeader()->getParent();
   bool OptForSize = F->hasOptSize() ||
@@ -462,11 +471,135 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
   return 0;
 }
 
-bool LoopVectorizationLegality::isUniform(Value *V) const {
-  return LAI->isUniform(V);
+bool LoopVectorizationLegality::isInvariant(Value *V) const {
+  return LAI->isInvariant(V);
+}
+
+namespace {
+/// A rewriter to build the SCEVs for each of the VF lanes in the expected
+/// vectorized loop, which can then be compared to detect their uniformity. This
+/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
+/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
+/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
+/// uniformity.
+class SCEVAddRecForUniformityRewriter
+    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
+  /// Multiplier to be applied to the step of AddRecs in TheLoop.
+  unsigned StepMultiplier;
+
+  /// Offset to be added to the AddRecs in TheLoop.
+  unsigned Offset;
+
+  /// Loop for which to rewrite AddRecsFor.
+  Loop *TheLoop;
+
+  /// Is any sub-expressions not analyzable w.r.t. uniformity?
+  bool CannotAnalyze = false;
+
+  bool canAnalyze() const { return !CannotAnalyze; }
+
+public:
+  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
+                                  unsigned Offset, Loop *TheLoop)
+      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
+        TheLoop(TheLoop) {}
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    assert(Expr->getLoop() == TheLoop &&
+           "addrec outside of TheLoop must be invariant and should have been "
+           "handled earlier");
+    // Build a new AddRec by multiplying the step by StepMultiplier and
+    // incrementing the start by Offset * step.
+    Type *Ty = Expr->getType();
+    auto *Step = Expr->getStepRecurrence(SE);
+    if (!SE.isLoopInvariant(Step, TheLoop)) {
+      CannotAnalyze = true;
+      return Expr;
+    }
+    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
+    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
+    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
+    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
+  }
+
+  const SCEV *visit(const SCEV *S) {
+    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
+      return S;
+    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *S) {
+    if (SE.isLoopInvariant(S, TheLoop))
+      return S;
+    // The value could vary across iterations.
+    CannotAnalyze = true;
+    return S;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
+    // Could not analyze the expression.
+    CannotAnalyze = true;
+    return S;
+  }
+
+  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
+                             unsigned StepMultiplier, unsigned Offset,
+                             Loop *TheLoop) {
+    /// Bail out if the expression does not contain an UDiv expression.
+    /// Uniform values which are not loop invariant require operations to strip
+    /// out the lowest bits. For now just look for UDivs and use it to avoid
+    /// re-writing UDIV-free expressions for other lanes to limit compile time.
+    if (!SCEVExprContains(S,
+                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
+      return SE.getCouldNotCompute();
+
+    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
+                                             TheLoop);
+    const SCEV *Result = Rewriter.visit(S);
+
+    if (Rewriter.canAnalyze())
+      return Result;
+    return SE.getCouldNotCompute();
+  }
+};
+
+} // namespace
+
+bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
+  if (isInvariant(V))
+    return true;
+  if (VF.isScalable())
+    return false;
+  if (VF.isScalar())
+    return true;
+
+  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
+  // never considered uniform.
+  auto *SE = PSE.getSE();
+  if (!SE->isSCEVable(V->getType()))
+    return false;
+  const SCEV *S = SE->getSCEV(V);
+
+  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
+  // lane 0 matches the expressions for all other lanes.
+  unsigned FixedVF = VF.getKnownMinValue();
+  const SCEV *FirstLaneExpr =
+      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
+  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
+    return false;
+
+  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
+  // lane 0. We check lanes in reverse order for compile-time, as frequently
+  // checking the last lane is sufficient to rule out uniformity.
+  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
+    const SCEV *IthLaneExpr =
+        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
+    return FirstLaneExpr == IthLaneExpr;
+  });
 }
 
-bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
+bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
+                                               ElementCount VF) const {
   Value *Ptr = getLoadStorePointerOperand(&I);
   if (!Ptr)
     return false;
@@ -474,7 +607,7 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
   // stores from being uniform. The current lowering simply doesn't handle
   // it; in particular, the cost model distinguishes scatter/gather from
   // scalar w/predication, and we currently rely on the scalar path.
-  return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
 }
 
 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
@@ -700,6 +833,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
+        // We prevent matching non-constant strided pointer IVS to preserve
+        // historical vectorizer behavior after a generalization of the
+        // IVDescriptor code. The intent is to remove this check, but we
+        // have to fix issues around code quality for such loops first.
+        auto isDisallowedStridedPointerInduction =
+            [](const InductionDescriptor &ID) {
+              if (AllowStridedPointerIVs)
+                return false;
+              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+                     ID.getConstIntStepValue() == nullptr;
+            };
+
         // TODO: Instead of recording the AllowedExit, it would be good to
         // record the complementary set: NotAllowedExit. These include (but may
         // not be limited to):
@@ -715,14 +860,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
         InductionDescriptor ID;
-        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
+            !isDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           Requirements->addExactFPMathInst(ID.getExactFPMathInst());
           continue;
         }
 
-        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop,
-                                                         SinkAfter, DT)) {
+        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
           AllowedExit.insert(Phi);
           FixedOrderRecurrences.insert(Phi);
           continue;
@@ -730,7 +875,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
         // As a last resort, coerce the PHI to a AddRec expression
        // and re-try classifying it a an induction PHI.
-        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
+            !isDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           continue;
         }
@@ -894,18 +1040,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     }
   }
 
-  // For fixed order recurrences, we use the previous value (incoming value from
-  // the latch) to check if it dominates all users of the recurrence. Bail out
-  // if we have to sink such an instruction for another recurrence, as the
-  // dominance requirement may not hold after sinking.
-  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
-  if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
-        Instruction *V =
-            cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
-        return SinkAfter.find(V) != SinkAfter.end();
-      }))
-    return false;
-
   // Now we know the widest induction type, check if our found induction
   // is the same size. If it's not, unset it here and InnerLoopVectorizer
   // will create another.
@@ -1124,6 +1258,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(
     if (isa<NoAliasScopeDeclInst>(&I))
       continue;
 
+    // We can allow masked calls if there's at least one vector variant, even
+    // if we end up scalarizing due to the cost model calculations.
+    // TODO: Allow other calls if they have appropriate attributes... readonly
+    // and argmemonly?
+    if (CallInst *CI = dyn_cast<CallInst>(&I))
+      if (VFDatabase::hasMaskedVariant(*CI)) {
+        MaskedOp.insert(CI);
+        continue;
+      }
+
     // Loads are handled via masking (or speculated if safe to do so.)
     if (auto *LI = dyn_cast<LoadInst>(&I)) {
       if (!SafePtrs.count(LI->getPointerOperand()))
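
The per-lane uniformity check added above can be summarized as: for a fixed VF, the expression for lane I is obtained by substituting iv -> I + VF * iv into each AddRec of the scalar loop (the patch's Offset and StepMultiplier), and a value is uniform if all VF lane expressions simplify to the same SCEV. The following standalone sketch illustrates that idea for values of the form (Start + Step * iv) / Div; it is toy C++ only, it does not use LLVM's SCEV classes, and all names in it (DivOfAffine, Affine, simplify, isUniformAcrossVF) are illustrative, not part of the patch.

// Toy illustration of the lane-rewriting idea behind
// SCEVAddRecForUniformityRewriter. Models floor((Start + Step * iv) / Div)
// for iv = 0, 1, 2, ... and asks whether all lanes of a VF-wide vector
// iteration see the same value.
#include <cassert>
#include <cstdint>
#include <optional>

struct DivOfAffine {
  uint64_t Start, Step, Div; // floor((Start + Step * iv) / Div)
};

// Normal form A + B * iv. Valid exactly when Div divides Step, because then
// floor((Start + Step * iv) / Div) == Start / Div + (Step / Div) * iv.
struct Affine {
  uint64_t A, B;
  bool operator==(const Affine &Other) const {
    return A == Other.A && B == Other.B;
  }
};

static std::optional<Affine> simplify(const DivOfAffine &E) {
  if (E.Div == 0 || E.Step % E.Div != 0)
    return std::nullopt; // cannot analyze; analogous to SCEVCouldNotCompute
  return Affine{E.Start / E.Div, E.Step / E.Div};
}

// True if E evaluates to the same value for every lane of one vector iteration.
static bool isUniformAcrossVF(const DivOfAffine &E, unsigned VF) {
  std::optional<Affine> Lane0;
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    // Lane expression: substitute iv -> Lane + VF * iv, i.e. the start grows
    // by Lane * Step and the step is multiplied by VF.
    DivOfAffine LaneExpr{E.Start + Lane * E.Step, E.Step * VF, E.Div};
    std::optional<Affine> S = simplify(LaneExpr);
    if (!S)
      return false; // give up instead of guessing, as the patch does
    if (!Lane0)
      Lane0 = S;
    else if (!(*S == *Lane0))
      return false;
  }
  return true;
}

int main() {
  // iv / 4: uniform for VF = 4, since lanes 0..3 of a vector iteration
  // starting at a multiple of 4 all compute the same quotient ...
  assert(isUniformAcrossVF({0, 1, 4}, 4));
  // ... but not for VF = 8, where lanes 0..3 and 4..7 see different values.
  assert(!isUniformAcrossVF({0, 1, 4}, 8));
  return 0;
}

As in the patch, the sketch bails out when it cannot reason about the division rather than guessing, and it checks lane expressions against lane 0 for equality; whether a real loop is classified as uniform additionally depends on SCEV being able to fold the rewritten per-lane expressions to a common form.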