author     Dimitry Andric <dim@FreeBSD.org>   2021-07-29 20:15:26 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2021-07-29 20:15:26 +0000
commit     344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree       f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  3868
1 file changed, 2299 insertions(+), 1569 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea0d7673edf6..f24ae6b100d5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -69,8 +69,8 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -117,6 +117,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -198,6 +199,11 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
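
This adds a hidden cl::opt capping the number of runtime memory checks the vectorizer will tolerate when vectorization was requested via a pragma; the knob can be tuned from the command line (e.g. -mllvm -pragma-vectorize-memory-check-threshold=256 through clang). A hedged sketch of how such a cap is typically consulted follows; the helper name and surrounding logic are illustrative, not the patch's actual call site.

// Hypothetical illustration only: even when '#pragma clang loop vectorize(enable)'
// forces vectorization, give up if the required runtime memory checks exceed the cap.
static bool tooManyMemChecksForPragma(unsigned NumRuntimePointerChecks,
                                      bool ForcedByPragma) {
  // PragmaVectorizeMemoryCheckThreshold is the cl::opt declared above (default 128).
  return ForcedByPragma &&
         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
}
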
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -326,6 +332,11 @@ static cl::opt<bool>
cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
+cl::opt<bool> EnableStrictReductions(
+ "enable-strict-reductions", cl::init(false), cl::Hidden,
+ cl::desc("Enable the vectorisation of loops with in-order (strict) "
+ "FP reductions"));
+
static cl::opt<bool> PreferPredicatedReductionSelect(
"prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
cl::desc(
@@ -361,30 +372,17 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-/// A helper function that returns the type of loaded or stored value.
-static Type *getMemInstValueType(Value *I) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Expected Load or Store instruction");
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getType();
- return cast<StoreInst>(I)->getValueOperand()->getType();
-}
+cl::opt<bool> PrintVPlansInDotFormat(
+ "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
+ cl::desc("Use dot format instead of plain text when dumping VPlans"));
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
-/// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
- // Determine if an array of VF elements of type Ty is "bitcast compatible"
- // with a <VF x Ty> vector.
- if (VF.isVector()) {
- auto *VectorTy = VectorType::get(Ty, VF);
- return TypeSize::get(VF.getKnownMinValue() *
- DL.getTypeAllocSize(Ty).getFixedValue(),
- VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
- }
-
- // If the vectorization factor is one, we just check if an array of type Ty
- // requires padding between elements.
+/// element of the corresponding vector type.
+static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
+ // Determine if an array of N elements of type Ty is "bitcast compatible"
+ // with a <N x Ty> vector.
+ // This is only true if there is no padding between the array elements.
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
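
For intuition, the rewritten check compares a type's allocation size with its nominal bit width: they differ exactly when an array of that type needs padding between elements, making [N x Ty] not bitcast-compatible with <N x Ty>. A minimal standalone sketch, not part of this patch and assuming a typical x86-64 data layout string:

// Standalone illustration of the hasIrregularType condition.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cstdio>

int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  auto IsIrregular = [&](llvm::Type *Ty) {
    return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
  };
  // i32: 32-bit size, 32-bit allocation -> regular.
  std::printf("i32 irregular: %d\n", (int)IsIrregular(llvm::Type::getInt32Ty(Ctx)));
  // i1: 1-bit size but allocated as a whole byte -> irregular, so [N x i1]
  // is not layout-compatible with <N x i1>.
  std::printf("i1  irregular: %d\n", (int)IsIrregular(llvm::Type::getInt1Ty(Ctx)));
}
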
@@ -396,19 +394,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
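
The reciprocal is used as a divisor in the cost model: work that only executes under a predicate contributes roughly half its cost per iteration. A hedged sketch of that pattern (the helper below is illustrative, not an exact call site in this file):

// Illustrative only: discount a predicated block's cost by its assumed 50%
// execution probability, i.e. divide by getReciprocalPredBlockProb().
static unsigned discountPredicatedBlockCost(unsigned BlockCost) {
  return BlockCost / getReciprocalPredBlockProb();
}
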
-/// A helper function that adds a 'fast' flag to floating-point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
- return V;
-}
-
-static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FMF);
- return V;
-}
-
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
@@ -439,6 +424,9 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
return None;
}
+// Forward declare GeneratedRTChecks.
+class GeneratedRTChecks;
+
namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
@@ -464,12 +452,11 @@ public:
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()),
- VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
- BFI(BFI), PSI(PSI) {
+ Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
+ PSI(PSI), RTChecks(RTChecks) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -500,7 +487,7 @@ public:
bool InvariantCond, VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
- void fixVectorizedLoop();
+ void fixVectorizedLoop(VPTransformState &State);
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -516,62 +503,31 @@ public:
unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
- Value *StartV, unsigned UF, ElementCount VF);
+ /// Vectorize a single first-order recurrence or pointer induction PHINode in
+ /// a block. This method handles the induction variable canonicalization. It
+ /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
+ VPTransformState &State);
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive. Uses the VPValue operands from \p Operands instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
+ void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, Value *Start,
- TruncInst *Trunc = nullptr);
-
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
- void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
- VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
- }
-
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll and vector indices \p Instance. If the value has been
- /// vectorized but not scalarized, the necessary extractelement instruction
- /// will be generated.
- Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+ void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
+ VPValue *Def, VPValue *CastDef,
+ VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
- void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+ void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
+ VPTransformState &State);
/// Try to vectorize interleaved access group \p Group with the base address
/// given in \p Addr, optionally masking the vector operations if \p
@@ -591,12 +547,24 @@ public:
VPValue *Def, VPValue *Addr,
VPValue *StoredValue, VPValue *BlockInMask);
- /// Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+ /// Set the debug location in the builder \p Ptr using the debug location in
+ /// \p V. If \p Ptr is None then it uses the class member's Builder.
+ void setDebugLocFromInst(const Value *V,
+ Optional<IRBuilder<> *> CustomBuilder = None);
/// Fix the non-induction PHIs in the OrigPHIsToFix vector.
- void fixNonInductionPHIs(void);
+ void fixNonInductionPHIs(VPTransformState &State);
+
+ /// Returns true if the reordering of FP operations is not allowed, but we are
+ /// able to vectorize with strict in-order reductions for the given RdxDesc.
+ bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+ /// this is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ virtual Value *getBroadcastInstrs(Value *V);
protected:
friend class LoopVectorizationPlanner;
@@ -620,25 +588,26 @@ protected:
Value *Step, Instruction *DL);
/// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs();
+ void fixCrossIterationPHIs(VPTransformState &State);
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
- void fixFirstOrderRecurrence(PHINode *Phi);
+ void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
/// Fix a reduction cross-iteration phi. This is the second phase of
/// vectorizing this phi node.
- void fixReduction(PHINode *Phi);
+ void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
/// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+ void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+ VPTransformState &State);
/// Fixup the LCSSA phi nodes in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
/// already in place, and we exit the vector loop exclusively to the middle
/// block.
- void fixLCSSAPHIs();
+ void fixLCSSAPHIs(VPTransformState &State);
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
@@ -646,16 +615,10 @@ protected:
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
- void truncateToMinimalBitwidths();
-
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
- /// this is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- virtual Value *getBroadcastInstrs(Value *V);
+ void truncateToMinimalBitwidths(VPTransformState &State);
- /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+ /// This function adds
+ /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -668,7 +631,8 @@ protected:
/// Note that \p EntryVal doesn't have to be an induction variable - it
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
- const InductionDescriptor &ID);
+ const InductionDescriptor &ID, VPValue *Def,
+ VPValue *CastDef, VPTransformState &State);
/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
@@ -677,7 +641,9 @@ protected:
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
- Instruction *EntryVal);
+ Instruction *EntryVal, VPValue *Def,
+ VPValue *CastDef,
+ VPTransformState &State);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -704,11 +670,10 @@ protected:
/// latter case \p EntryVal is a TruncInst and we must not record anything for
/// that IV, but it's error-prone to expect callers of this routine to care
/// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
- const Instruction *EntryVal,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
+ void recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
+ unsigned Part, unsigned Lane = UINT_MAX);
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -729,11 +694,14 @@ protected:
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
/// Emit a bypass check to see if all of the SCEV assumptions we've
- /// had to make are correct.
- void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ /// had to make are correct. Returns the block containing the checks or
+ /// nullptr if no checks have been added.
+ BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
/// Emit bypass checks to check any memory assumptions we may have made.
- void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+ /// Returns the block containing the checks or nullptr if no checks have been
+ /// added.
+ BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
@@ -848,7 +816,7 @@ protected:
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
- /// The (unique) ExitBlock of the scalar loop. Note that
+ /// The unique ExitBlock of the scalar loop if one exists. Note that
/// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;
@@ -867,12 +835,6 @@ protected:
/// The induction variable of the old basic block.
PHINode *OldInduction = nullptr;
- /// Maps values from the original loop to their corresponding values in the
- /// vectorized loop. A key value can map to either vector values, scalar
- /// values or both kinds of values, depending on whether the key was
- /// vectorized and scalarized.
- VectorizerValueMap VectorLoopValueMap;
-
/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;
@@ -906,6 +868,10 @@ protected:
// Whether this loop should be optimized for size based on profile guided size
// optimizatios.
bool OptForSizeBasedOnProfile;
+
+ /// Structure to hold information about generated runtime checks, responsible
+ /// for cleaning the checks, if vectorization turns out unprofitable.
+ GeneratedRTChecks &RTChecks;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -917,10 +883,10 @@ public:
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
- BFI, PSI) {}
+ BFI, PSI, Check) {}
private:
Value *getBroadcastInstrs(Value *V) override;
@@ -969,9 +935,11 @@ public:
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Checks)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
+ EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
+ Checks),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -1005,9 +973,10 @@ public:
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Check)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI) {}
+ EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
@@ -1027,17 +996,16 @@ protected:
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
- EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE,
- EpilogueLoopVectorizationInfo &EPI,
- LoopVectorizationLegality *LVL,
- llvm::LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
+ EpilogueVectorizerEpilogueLoop(
+ Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
+ LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ GeneratedRTChecks &Checks)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI) {}
+ EPI, LVL, CM, BFI, PSI, Checks) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
@@ -1064,8 +1032,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (I->getDebugLoc() != Empty)
return I;
- for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
- if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ for (Use &Op : I->operands()) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->getDebugLoc() != Empty)
return OpInst;
}
@@ -1073,34 +1041,38 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+void InnerLoopVectorizer::setDebugLocFromInst(
+ const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
+ IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
const DILocation *DIL = Inst->getDebugLoc();
+
+ // When a FSDiscriminator is enabled, we don't need to add the multiply
+ // factors to the discriminators.
if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst)) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
+ // FIXME: For scalable vectors, assume vscale=1.
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
if (NewDIL)
- B.SetCurrentDebugLocation(NewDIL.getValue());
+ B->SetCurrentDebugLocation(NewDIL.getValue());
else
LLVM_DEBUG(dbgs()
<< "Failed to create new discriminator: "
<< DIL->getFilename() << " Line: " << DIL->getLine());
- }
- else
- B.SetCurrentDebugLocation(DIL);
+ } else
+ B->SetCurrentDebugLocation(DIL);
} else
- B.SetCurrentDebugLocation(DebugLoc());
+ B->SetCurrentDebugLocation(DebugLoc());
}
-/// Write a record \p DebugMsg about vectorization failure to the debug
-/// output stream. If \p I is passed, it is an instruction that prevents
-/// vectorization.
+/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
+/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
-static void debugVectorizationFailure(const StringRef DebugMsg,
- Instruction *I) {
- dbgs() << "LV: Not vectorizing: " << DebugMsg;
+static void debugVectorizationMessage(const StringRef Prefix,
+ const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: " << Prefix << DebugMsg;
if (I != nullptr)
dbgs() << " " << *I;
else
@@ -1129,9 +1101,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
DL = I->getDebugLoc();
}
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
+ return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
/// Return a value for Step multiplied by VF.
@@ -1145,13 +1115,31 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
namespace llvm {
+/// Return the runtime value for VF.
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+ Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+ return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
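
getRuntimeVF materializes the element count as IR: a plain constant for fixed-width VFs and vscale * known-minimum for scalable ones. A hedged usage sketch as it might appear inside this file; the helper and variable names are illustrative, not actual call sites.

// Illustrative only: form the vector loop's per-iteration index step, VF * UF,
// which is a runtime value when the VF is scalable.
static Value *computeVectorIndexStep(IRBuilder<> &B, Type *IdxTy,
                                     ElementCount VF, unsigned UF) {
  Value *RuntimeVF = getRuntimeVF(B, IdxTy, VF); // VF, or vscale * MinVF
  return B.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, UF));
}
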
void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
- LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I) {
+ LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
- ORETag, TheLoop, I) << OREMsg);
+ ORE->emit(
+ createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
+ << "loop not vectorized: " << OREMsg);
+}
+
+void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I) {
+ LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(
+ createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
+ << Msg);
}
} // end namespace llvm
@@ -1220,6 +1208,16 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};
+/// ElementCountComparator creates a total ordering for ElementCount
+/// for the purposes of using it in a set structure.
+struct ElementCountComparator {
+ bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
+ return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+ std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
+ }
+};
+using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+
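
The comparator orders all fixed factors before all scalable ones, with each group sorted by known minimum element count. A small standalone sketch of the resulting ordering, not part of this patch and assuming only LLVM's TypeSize.h:

// Standalone illustration of the ordering used by ElementCountSet.
#include "llvm/Support/TypeSize.h"
#include <cstdio>
#include <set>
#include <tuple>

struct ElementCountComparator {
  bool operator()(const llvm::ElementCount &LHS,
                  const llvm::ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};

int main() {
  std::set<llvm::ElementCount, ElementCountComparator> VFs = {
      llvm::ElementCount::getScalable(4), llvm::ElementCount::getFixed(8),
      llvm::ElementCount::getFixed(2), llvm::ElementCount::getScalable(2)};
  for (const llvm::ElementCount &VF : VFs)
    std::printf("%s%u\n", VF.isScalable() ? "vscale x " : "",
                (unsigned)VF.getKnownMinValue());
  // Prints: 2, 8, vscale x 2, vscale x 4.
}
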
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -1242,27 +1240,32 @@ public:
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {}
- /// \return An upper bound for the vectorization factor, or None if
- /// vectorization and interleaving should be avoided up front.
- Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
+ /// \return An upper bound for the vectorization factors (both fixed and
+ /// scalable). If the factors are 0, vectorization and interleaving should be
+ /// avoided up front.
+ FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
bool runtimeChecksRequired();
/// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+ /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+ VectorizationFactor
+ selectVectorizationFactor(const ElementCountSet &CandidateVFs);
+
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF,
const LoopVectorizationPlanner &LVP);
/// Setup cost-based decisions for user vectorization factor.
- void selectUserVectorizationFactor(ElementCount UserVF) {
+ /// \return true if the UserVF is a feasible VF to be chosen.
+ bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
+ return expectedCost(UserVF).first.isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -1304,10 +1307,22 @@ public:
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
+ /// Collect all element types in the loop for which widening is needed.
+ void collectElementTypesForWidening();
+
/// Split reductions into those that happen in the loop, and those that happen
/// outside. In loop reductions are collected into InLoopReductionChains.
void collectInLoopReductions();
+ /// Returns true if we should use strict in-order reductions for the given
+ /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
+ /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
+ /// of FP operations.
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
+ return EnableStrictReductions && !Hints->allowReordering() &&
+ RdxDesc.isOrdered();
+ }
+
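
An in-order ("strict") reduction keeps the sequential association of the scalar loop instead of splitting the sum into independent per-lane partial sums, which matters because floating-point addition is not associative. A small standalone sketch, independent of LLVM, contrasting the two shapes:

// Why ordered FP reductions can differ from the reassociated form a
// vectorizer normally emits (illustration only).
#include <cstdio>

int main() {
  const float A[4] = {1e8f, 1.0f, -1e8f, 1.0f};

  // Strict, in-order reduction: ((a0 + a1) + a2) + a3, as the scalar loop does.
  float Ordered = 0.0f;
  for (float V : A)
    Ordered += V;

  // Reassociated reduction: two partial sums combined at the end, the shape a
  // two-lane vectorized reduction would produce.
  float Lane0 = A[0] + A[2]; // 1e8 + (-1e8) == 0
  float Lane1 = A[1] + A[3]; // 1 + 1 == 2
  float Reassociated = Lane0 + Lane1;

  // 1e8f + 1.0f rounds back to 1e8f, so the results differ (1 vs 2).
  std::printf("ordered = %g, reassociated = %g\n", Ordered, Reassociated);
}
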
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
@@ -1411,7 +1426,7 @@ public:
/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling.
- InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
+ InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
assert(VF.isVector() && "Expected VF to be a vector VF");
// Cost model is not run in the VPlan-native path - return conservative
// result until this changes.
@@ -1479,30 +1494,18 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedLoad(DataType, Alignment);
}
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedScatter(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedGather(DataType, Alignment);
- }
-
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
bool isLegalGatherOrScatter(Value *V) {
@@ -1510,10 +1513,19 @@ public:
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
return false;
- auto *Ty = getMemInstValueType(V);
+ auto *Ty = getLoadStoreType(V);
Align Align = getLoadStoreAlignment(V);
- return (LI && isLegalMaskedGather(Ty, Align)) ||
- (SI && isLegalMaskedScatter(Ty, Align));
+ return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
+ (SI && TTI.isLegalMaskedScatter(Ty, Align));
+ }
+
+ /// Returns true if the target machine supports all of the reduction
+ /// variables found for the given VF.
+ bool canVectorizeReductions(ElementCount VF) const {
+ return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
+ }));
}
/// Returns true if \p I is an instruction that will be scalarized with
@@ -1521,8 +1533,7 @@ public:
/// instructions that may divide by zero.
/// If a non-zero VF has been calculated, we check if I will be scalarized
/// predication for that VF.
- bool isScalarWithPredication(Instruction *I,
- ElementCount VF = ElementCount::getFixed(1));
+ bool isScalarWithPredication(Instruction *I) const;
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
@@ -1563,14 +1574,14 @@ public:
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue() const {
+ bool requiresScalarEpilogue(ElementCount VF) const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
- return InterleaveInfo.requiresScalarEpilogue();
+ return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -1582,7 +1593,7 @@ public:
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
- bool blockNeedsPredication(BasicBlock *BB) {
+ bool blockNeedsPredication(BasicBlock *BB) const {
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
@@ -1605,7 +1616,7 @@ public:
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
- InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
+ InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
@@ -1613,7 +1624,12 @@ public:
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize);
+ bool &NeedToScalarize) const;
+
+ /// Returns true if the per-lane cost of VectorizationFactor A is lower than
+ /// that of B.
+ bool isMoreProfitable(const VectorizationFactor &A,
+ const VectorizationFactor &B) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1625,26 +1641,48 @@ public:
private:
unsigned NumPredStores = 0;
- /// \return An upper bound for the vectorization factor, a power-of-2 larger
- /// than zero. One is returned if vectorization should best be avoided due
- /// to cost.
- ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF);
+ /// \return An upper bound for the vectorization factors for both
+ /// fixed and scalable vectorization, where the minimum-known number of
+ /// elements is a power-of-2 larger than zero. If scalable vectorization is
+ /// disabled or unsupported, then the scalable part will be equal to
+ /// ElementCount::getScalable(0).
+ FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF);
+
+ /// \return the maximized element count based on the targets vector
+ /// registers and the loop trip-count, but limited to a maximum safe VF.
+ /// This is a helper function of computeFeasibleMaxVF.
+ /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
+ /// issue that occurred on one of the buildbots which cannot be reproduced
+ /// without having access to the proprietary compiler (see comments on
+ /// D98509). The issue is currently under investigation and this workaround
+ /// will be removed as soon as possible.
+ ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
+ unsigned SmallestType,
+ unsigned WidestType,
+ const ElementCount &MaxSafeVF);
+
+ /// \return the maximum legal scalable VF, based on the safe max number
+ /// of elements.
+ ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
- /// operate on
- /// vector values after type legalization in the backend. If this latter value
- /// is
- /// false, then all operations will be scalarized (i.e. no vectorization has
- /// actually taken place).
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
- /// the factor width.
- VectorizationCostTy expectedCost(ElementCount VF);
+ /// the factor width. If \p Invalid is not nullptr, this function
+ /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+ /// each instruction that has an Invalid cost for the given VF.
+ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
+ VectorizationCostTy
+ expectedCost(ElementCount VF,
+ SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
@@ -1657,9 +1695,9 @@ private:
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
- Type *VectorTy,
- TTI::TargetCostKind CostKind);
+ Optional<InstructionCost>
+ getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+ TTI::TargetCostKind CostKind);
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1685,7 +1723,8 @@ private:
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
+ InstructionCost getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const;
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
@@ -1803,7 +1842,7 @@ private:
/// Returns a range containing only operands needing to be extracted.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
- ElementCount VF) {
+ ElementCount VF) const {
return SmallVector<Value *, 4>(make_filter_range(
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
@@ -1861,12 +1900,216 @@ public:
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+ /// All element types found in the loop.
+ SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
/// Profitable vector factors.
SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
-
} // end namespace llvm
+/// Helper struct to manage generating runtime checks for vectorization.
+///
+/// The runtime checks are created up-front in temporary blocks to allow better
+/// estimating the cost and un-linked from the existing IR. After deciding to
+/// vectorize, the checks are moved back. If deciding not to vectorize, the
+/// temporary blocks are completely removed.
+class GeneratedRTChecks {
+ /// Basic block which contains the generated SCEV checks, if any.
+ BasicBlock *SCEVCheckBlock = nullptr;
+
+ /// The value representing the result of the generated SCEV checks. If it is
+ /// nullptr, either no SCEV checks have been generated or they have been used.
+ Value *SCEVCheckCond = nullptr;
+
+ /// Basic block which contains the generated memory runtime checks, if any.
+ BasicBlock *MemCheckBlock = nullptr;
+
+ /// The value representing the result of the generated memory runtime checks.
+ /// If it is nullptr, either no memory runtime checks have been generated or
+ /// they have been used.
+ Instruction *MemRuntimeCheckCond = nullptr;
+
+ DominatorTree *DT;
+ LoopInfo *LI;
+
+ SCEVExpander SCEVExp;
+ SCEVExpander MemCheckExp;
+
+public:
+ GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
+ const DataLayout &DL)
+ : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
+ MemCheckExp(SE, DL, "scev.check") {}
+
+ /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
+ /// accurately estimate the cost of the runtime checks. The blocks are
+ /// un-linked from the IR and is added back during vector code generation. If
+ /// there is no vector code generation, the check blocks are removed
+ /// completely.
+ void Create(Loop *L, const LoopAccessInfo &LAI,
+ const SCEVUnionPredicate &UnionPred) {
+
+ BasicBlock *LoopHeader = L->getHeader();
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Use SplitBlock to create blocks for SCEV & memory runtime checks to
+ // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
+ // may be used by SCEVExpander. The blocks will be un-linked from their
+ // predecessors and removed from LI & DT at the end of the function.
+ if (!UnionPred.isAlwaysTrue()) {
+ SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+ nullptr, "vector.scevcheck");
+
+ SCEVCheckCond = SCEVExp.expandCodeForPredicate(
+ &UnionPred, SCEVCheckBlock->getTerminator());
+ }
+
+ const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
+ if (RtPtrChecking.Need) {
+ auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
+ MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
+ "vector.memcheck");
+
+ std::tie(std::ignore, MemRuntimeCheckCond) =
+ addRuntimeChecks(MemCheckBlock->getTerminator(), L,
+ RtPtrChecking.getChecks(), MemCheckExp);
+ assert(MemRuntimeCheckCond &&
+ "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
+ }
+
+ if (!MemCheckBlock && !SCEVCheckBlock)
+ return;
+
+ // Unhook the temporary block with the checks, update various places
+ // accordingly.
+ if (SCEVCheckBlock)
+ SCEVCheckBlock->replaceAllUsesWith(Preheader);
+ if (MemCheckBlock)
+ MemCheckBlock->replaceAllUsesWith(Preheader);
+
+ if (SCEVCheckBlock) {
+ SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
+ new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
+ Preheader->getTerminator()->eraseFromParent();
+ }
+ if (MemCheckBlock) {
+ MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
+ new UnreachableInst(Preheader->getContext(), MemCheckBlock);
+ Preheader->getTerminator()->eraseFromParent();
+ }
+
+ DT->changeImmediateDominator(LoopHeader, Preheader);
+ if (MemCheckBlock) {
+ DT->eraseNode(MemCheckBlock);
+ LI->removeBlock(MemCheckBlock);
+ }
+ if (SCEVCheckBlock) {
+ DT->eraseNode(SCEVCheckBlock);
+ LI->removeBlock(SCEVCheckBlock);
+ }
+ }
+
+ /// Remove the created SCEV & memory runtime check blocks & instructions, if
+ /// unused.
+ ~GeneratedRTChecks() {
+ SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
+ SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
+ if (!SCEVCheckCond)
+ SCEVCleaner.markResultUsed();
+
+ if (!MemRuntimeCheckCond)
+ MemCheckCleaner.markResultUsed();
+
+ if (MemRuntimeCheckCond) {
+ auto &SE = *MemCheckExp.getSE();
+ // Memory runtime check generation creates compares that use expanded
+ // values. Remove them before running the SCEVExpanderCleaners.
+ for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
+ if (MemCheckExp.isInsertedInstruction(&I))
+ continue;
+ SE.forgetValue(&I);
+ SE.eraseValueFromMap(&I);
+ I.eraseFromParent();
+ }
+ }
+ MemCheckCleaner.cleanup();
+ SCEVCleaner.cleanup();
+
+ if (SCEVCheckCond)
+ SCEVCheckBlock->eraseFromParent();
+ if (MemRuntimeCheckCond)
+ MemCheckBlock->eraseFromParent();
+ }
+
+ /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
+ /// adjusts the branches to branch to the vector preheader or \p Bypass,
+ /// depending on the generated condition.
+ BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *LoopVectorPreHeader,
+ BasicBlock *LoopExitBlock) {
+ if (!SCEVCheckCond)
+ return nullptr;
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
+ if (C->isZero())
+ return nullptr;
+
+ auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
+
+ BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
+ // Create new preheader for vector loop.
+ if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
+ PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+
+ SCEVCheckBlock->getTerminator()->eraseFromParent();
+ SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
+ Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
+ SCEVCheckBlock);
+
+ DT->addNewBlock(SCEVCheckBlock, Pred);
+ DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
+
+ ReplaceInstWithInst(
+ SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
+ // Mark the check as used, to prevent it from being removed during cleanup.
+ SCEVCheckCond = nullptr;
+ return SCEVCheckBlock;
+ }
+
+ /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
+ /// the branches to branch to the vector preheader or \p Bypass, depending on
+ /// the generated condition.
+ BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *LoopVectorPreHeader) {
+ // Check if we generated code that checks in runtime if arrays overlap.
+ if (!MemRuntimeCheckCond)
+ return nullptr;
+
+ auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
+ Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
+ MemCheckBlock);
+
+ DT->addNewBlock(MemCheckBlock, Pred);
+ DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
+ MemCheckBlock->moveBefore(LoopVectorPreHeader);
+
+ if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
+ PL->addBasicBlockToLoop(MemCheckBlock, *LI);
+
+ ReplaceInstWithInst(
+ MemCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
+ MemCheckBlock->getTerminator()->setDebugLoc(
+ Pred->getTerminator()->getDebugLoc());
+
+ // Mark the check as used, to prevent it from being removed during cleanup.
+ MemRuntimeCheckCond = nullptr;
+ return MemCheckBlock;
+ }
+};
+
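
Taken together, the class follows a three-phase lifecycle: speculatively create the check blocks before costing, splice them back in only when a vectorization path is taken, and let the destructor erase whatever was never used. Below is a hedged sketch of that flow as it might look inside this file; the surrounding function and variable names are placeholders, not the actual call sites.

// Illustrative lifecycle of GeneratedRTChecks (placeholder names).
static void sketchRuntimeCheckLifecycle(Loop *L, ScalarEvolution &SE,
                                        DominatorTree *DT, LoopInfo *LI,
                                        const LoopAccessInfo &LAI,
                                        const SCEVUnionPredicate &Pred,
                                        bool DecidedToVectorize,
                                        BasicBlock *Bypass,
                                        BasicBlock *VectorPH,
                                        BasicBlock *ExitBB) {
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // 1. Build the check blocks up front, unlinked from the CFG, so their cost
  //    can be estimated before committing to vectorization.
  GeneratedRTChecks Checks(SE, DT, LI, DL);
  Checks.Create(L, LAI, Pred);

  if (DecidedToVectorize) {
    // 2. Splice the checks back in front of the vector preheader. Each call
    //    returns the inserted block (or nullptr if that kind of check is not
    //    needed) and marks the corresponding condition as used.
    Checks.emitSCEVChecks(L, Bypass, VectorPH, ExitBB);
    Checks.emitMemRuntimeChecks(L, Bypass, VectorPH);
  }
  // 3. On destruction, any check block whose condition was never consumed is
  //    deleted again, together with the expanded instructions.
}
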
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2031,7 +2274,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal) {
+ Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
+ VPTransformState &State) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
@@ -2063,16 +2307,20 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
- Value *ConstVF =
- getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
- Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+ Type *StepType = Step->getType();
+ if (Step->getType()->isFloatingPointTy())
+ StepType = IntegerType::get(StepType->getContext(),
+ StepType->getScalarSizeInBits());
+ Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+ if (Step->getType()->isFloatingPointTy())
+ RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
+ Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
// Create a vector splat to use in the induction update.
//
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *SplatVF = isa<Constant>(Mul)
? ConstantVector::getSplat(VF, cast<Constant>(Mul))
: Builder.CreateVectorSplat(VF, Mul);
@@ -2085,14 +2333,15 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
- VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ State.set(Def, LastInduction, Part);
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
+ State, Part);
- LastInduction = cast<Instruction>(addFastMathFlag(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction = cast<Instruction>(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}
@@ -2125,7 +2374,8 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
+ unsigned Part, unsigned Lane) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
@@ -2144,15 +2394,16 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
- Instruction *CastInst = *Casts.begin();
if (Lane < UINT_MAX)
- VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
else
- VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+ State.set(CastDef, VectorLoopVal, Part);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
- TruncInst *Trunc) {
+ TruncInst *Trunc, VPValue *Def,
+ VPValue *CastDef,
+ VPTransformState &State) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -2214,13 +2465,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *EntryPart =
getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
ID.getInductionOpcode());
- VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
+ State, Part);
}
};
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
+
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
if (VF.isZero() || VF.isScalar()) {
@@ -2234,7 +2491,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
+ State);
return;
}
@@ -2242,13 +2500,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
+ State);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
return;
}
@@ -2258,14 +2517,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *ScalarIV = CreateScalarIV(Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
- auto *ValVTy = cast<FixedVectorType>(Val->getType());
- int VLen = ValVTy->getNumElements();
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
@@ -2274,52 +2533,44 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
SmallVector<Constant *, 8> Indices;
- if (STy->isIntegerTy()) {
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ Type *InitVecValSTy = STy;
+ if (STy->isFloatingPointTy()) {
+ InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Add on StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(
+ VLen, ConstantInt::get(InitVecValSTy, StartIdx));
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
- assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ if (STy->isIntegerTy()) {
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
- Step = Builder.CreateMul(Cv, Step);
+ Step = Builder.CreateMul(InitVec, Step);
return Builder.CreateAdd(Val, Step, "induction");
}
// Floating point induction.
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
-
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
-
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
Step = Builder.CreateVectorSplat(VLen, Step);
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
- Value *MulOp = Builder.CreateFMul(Cv, Step);
- if (isa<Instruction>(MulOp))
- // Have to check, MulOp may be a constant
- cast<Instruction>(MulOp)->setFastMathFlags(Flags);
-
- Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
- return BOp;
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
- const InductionDescriptor &ID) {
+ const InductionDescriptor &ID,
+ VPValue *Def, VPValue *CastDef,
+ VPTransformState &State) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
@@ -2342,169 +2593,74 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
- unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
- ? 1
- : VF.getKnownMinValue();
- assert((!VF.isScalable() || Lanes == 1) &&
- "Should never scalarize a scalable vector");
- // Compute the scalar steps and save the results in VectorLoopValueMap.
+ bool IsUniform =
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
+ unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
+ // Compute the scalar steps and save the results in State.
+ Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+ ScalarIVTy->getScalarSizeInBits());
+ Type *VecIVTy = nullptr;
+ Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+ if (!IsUniform && VF.isScalable()) {
+ VecIVTy = VectorType::get(ScalarIVTy, VF);
+ UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
+ SplatStep = Builder.CreateVectorSplat(VF, Step);
+ SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+ }
+
for (unsigned Part = 0; Part < UF; ++Part) {
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
- ScalarIVTy->getScalarSizeInBits());
- Value *StartIdx =
- createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+ Value *StartIdx0 =
+ createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+
+ if (!IsUniform && VF.isScalable()) {
+ auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+ auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
if (ScalarIVTy->isFloatingPointTy())
- StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
- StartIdx = addFastMathFlag(Builder.CreateBinOp(
- AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+ InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+ auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+ auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+ State.set(Def, Add, Part);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
+ Part);
+ // It's useful to record the lane values too for the known minimum number
+ // of elements so we do those below. This improves the code quality when
+ // trying to extract the first element, for example.
+ }
+
+ if (ScalarIVTy->isFloatingPointTy())
+ StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
+
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Value *StartIdx = Builder.CreateBinOp(
+ AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
// The step returned by `createStepForVF` is a runtime-evaluated value
// when VF is scalable. Otherwise, it should be folded into a Constant.
assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
- auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
- auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
- }
- }
-}
-
-Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't widen a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- // If we have a stride that is replaced by one, do it here. Defer this for
- // the VPlan-native path until we start running Legal checks in that path.
- if (!EnableVPlanNativePath && Legal->hasStride(V))
- V = ConstantInt::get(V->getType(), 1);
-
- // If we have a vector mapped to this value, return it.
- if (VectorLoopValueMap.hasVectorValue(V, Part))
- return VectorLoopValueMap.getVectorValue(V, Part);
-
- // If the value has not been vectorized, check if it has been scalarized
- // instead. If it has been scalarized, and we actually need the value in
- // vector form, we will construct the vector values on demand.
- if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
-
- // If we've scalarized a value, that value should be an instruction.
- auto *I = cast<Instruction>(V);
-
- // If we aren't vectorizing, we can just copy the scalar map values over to
- // the vector map.
- if (VF.isScalar()) {
- VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
- return ScalarValue;
+ auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
+ State.set(Def, Add, VPIteration(Part, Lane));
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
+ Part, Lane);
}
-
- // Get the last scalar instruction we generated for V and Part. If the value
- // is known to be uniform after vectorization, this corresponds to lane zero
- // of the Part unroll iteration. Otherwise, the last instruction is the one
- // we created for the last vector lane of the Part unroll iteration.
- unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
- ? 0
- : VF.getKnownMinValue() - 1;
- assert((!VF.isScalable() || LastLane == 0) &&
- "Scalable vectorization can't lead to any scalarized values.");
- auto *LastInst = cast<Instruction>(
- VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
-
- // Set the insert point after the last scalarized instruction. This ensures
- // the insertelement sequence will directly follow the scalar definitions.
- auto OldIP = Builder.saveIP();
- auto NewIP = std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
-
- // However, if we are vectorizing, we need to construct the vector values.
- // If the value is known to be uniform after vectorization, we can just
- // broadcast the scalar value corresponding to lane zero for each unroll
- // iteration. Otherwise, we construct the vector values using insertelement
- // instructions. Since the resulting vectors are stored in
- // VectorLoopValueMap, we will only generate the insertelements once.
- Value *VectorValue = nullptr;
- if (Cost->isUniformAfterVectorization(I, VF)) {
- VectorValue = getBroadcastInstrs(ScalarValue);
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
- } else {
- // Initialize packing with insertelements to start from poison.
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
- VectorLoopValueMap.setVectorValue(V, Part, Poison);
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- packScalarIntoVectorValue(V, {Part, Lane});
- VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
- }
- Builder.restoreIP(OldIP);
- return VectorValue;
}
-
- // If this scalar is unknown, assume that it is a constant or that it is
- // loop invariant. Broadcast V and save the value for future uses.
- Value *B = getBroadcastInstrs(V);
- VectorLoopValueMap.setVectorValue(V, Part, B);
- return B;
}
-Value *
-InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) {
- // If the value is not an instruction contained in the loop, it should
- // already be scalar.
- if (OrigLoop->isLoopInvariant(V))
- return V;
-
- assert(Instance.Lane > 0
- ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
-
- // If the value from the original loop has not been vectorized, it is
- // represented by UF x VF scalar values in the new loop. Return the requested
- // scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Instance))
- return VectorLoopValueMap.getScalarValue(V, Instance);
-
- // If the value has not been scalarized, get its entry in VectorLoopValueMap
- // for the given unroll part. If this entry is not a vector type (i.e., the
- // vectorization factor is one), there is no need to generate an
- // extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Instance.Part);
- if (!U->getType()->isVectorTy()) {
- assert(VF.isScalar() && "Value not scalarized has non-vector type");
- return U;
- }
-
- // Otherwise, the value from the original loop has been vectorized and is
- // represented by UF vector values. Extract and return the requested scalar
- // value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
-}
-
-void InnerLoopVectorizer::packScalarIntoVectorValue(
- Value *V, const VPIteration &Instance) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't pack a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
- Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
- VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
- Builder.getInt32(Instance.Lane));
- VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
+void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
+ const VPIteration &Instance,
+ VPTransformState &State) {
+ Value *ScalarInst = State.get(Def, Instance);
+ Value *VectorValue = State.get(Def, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, ScalarInst,
+ Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
+ State.set(Def, VectorValue, Instance.Part);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
- assert(!VF.isScalable() && "Cannot reverse scalable vectors");
- SmallVector<int, 8> ShuffleMask;
- for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
- ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
- return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
+ return Builder.CreateVectorReverse(Vec, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
@@ -2554,7 +2710,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = getMemInstValueType(Instr);
+ Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
@@ -2573,14 +2729,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
- assert(!VF.isScalable() &&
- "scalable vector reverse operation is not implemented");
if (Group->isReverse())
Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *AddrPart = State.get(Addr, {Part, 0});
- setDebugLocFromInst(Builder, AddrPart);
+ Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
+ setDebugLocFromInst(AddrPart);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2606,12 +2760,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
}
- setDebugLocFromInst(Builder, Instr);
+ setDebugLocFromInst(Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
Value *MaskForGaps = nullptr;
if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
@@ -2628,7 +2781,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *GroupMask = MaskForGaps;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *ShuffledMask = Builder.CreateShuffleVector(
BlockInMaskPart,
createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
@@ -2639,7 +2791,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
: ShuffledMask;
}
NewLoad =
- Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
+ Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
GroupMask, PoisonVec, "wide.masked.vec");
}
else
@@ -2659,7 +2811,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (!Member)
continue;
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto StrideMask =
createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2676,7 +2827,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (Group->isReverse())
StridedVec = reverseVector(StridedVec);
- State.set(VPDefs[J], Member, StridedVec, Part);
+ State.set(VPDefs[J], StridedVec, Part);
}
++J;
}
@@ -2684,7 +2835,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
// The sub vector type for current instruction.
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
@@ -2712,7 +2862,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *IVec = Builder.CreateShuffleVector(
WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
"interleaved.vec");
@@ -2753,7 +2902,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
"CM decision is not to widen the memory instruction");
- Type *ScalarDataTy = getMemInstValueType(Instr);
+ Type *ScalarDataTy = getLoadStoreType(Instr);
auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);
@@ -2785,18 +2934,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();
-
if (Reverse) {
- assert(!VF.isScalable() &&
- "Reversing vectors is not yet supported for scalable vectors.");
-
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
- PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
- ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+ // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
+ Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+ PartPtr =
+ cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
- ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
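
The comments above describe the reverse-access addressing: for a fixed VF of 4 and Part == 1 the two GEP offsets are -4 and then -3, i.e. the lowest-addressed lane of the reversed second part. A condensed editorial sketch of that computation (not part of the patch; getRuntimeVF is assumed to be the helper used elsewhere in this file, emitReversePartPtr is a hypothetical name):

// Editorial sketch of the reversed part-pointer computation, not part of the patch.
static Value *emitReversePartPtr(IRBuilder<> &B, Type *ScalarDataTy, Value *Ptr,
                                 unsigned Part, ElementCount VF) {
  // KnownMin for fixed VFs, vscale * KnownMin for scalable VFs (assumed helper).
  Value *RunTimeVF = getRuntimeVF(B, B.getInt32Ty(), VF);
  // Step back Part whole vectors from the base pointer...
  Value *NumElt = B.CreateMul(B.getInt32(-Part), RunTimeVF);
  // ...and then to the lowest-addressed element of this part's reversed range.
  Value *LastLane = B.CreateSub(B.getInt32(1), RunTimeVF);
  Value *PartPtr = B.CreateGEP(ScalarDataTy, Ptr, NumElt);
  return B.CreateGEP(ScalarDataTy, PartPtr, LastLane);
}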
@@ -2813,7 +2965,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
// Handle Stores:
if (SI) {
- setDebugLocFromInst(Builder, SI);
+ setDebugLocFromInst(SI);
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
@@ -2831,7 +2983,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
BlockInMaskParts[Part]);
@@ -2845,21 +2997,21 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
// Handle loads.
assert(LI && "Must have a load instruction");
- setDebugLocFromInst(Builder, LI);
+ setDebugLocFromInst(LI);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
- VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
- "wide.masked.load");
+ DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+ PoisonValue::get(DataTy), "wide.masked.load");
else
NewLI =
Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
@@ -2870,11 +3022,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
NewLI = reverseVector(NewLI);
}
- State.set(Def, Instr, NewLI, Part);
+ State.set(Def, NewLI, Part);
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
+ VPUser &User,
const VPIteration &Instance,
bool IfPredicateInstr,
VPTransformState &State) {
@@ -2883,10 +3036,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
// the first lane and part.
if (isa<NoAliasScopeDeclInst>(Instr))
- if (Instance.Lane != 0 || Instance.Part != 0)
+ if (!Instance.isFirstIteration())
return;
- setDebugLocFromInst(Builder, Instr);
+ setDebugLocFromInst(Instr);
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2895,6 +3048,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
+ State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
+ Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
@@ -2902,7 +3057,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
- InputInstance.Lane = 0;
+ InputInstance.Lane = VPLane::getFirstLane();
auto *NewOp = State.get(User.getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
@@ -2911,15 +3066,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- // TODO: Set result for VPValue of VPReciplicateRecipe. This requires
- // representing scalar values in VPTransformState. Add the cloned scalar to
- // the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
+ State.set(Def, Cloned, Instance);
// If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ if (auto *II = dyn_cast<AssumeInst>(Cloned))
+ AC->registerAssumption(II);
// End if-block.
if (IfPredicateInstr)
@@ -2936,21 +3087,28 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
if (!Latch)
Latch = Header;
- IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ IRBuilder<> B(&*Header->getFirstInsertionPt());
Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(Builder, OldInst);
- auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+ setDebugLocFromInst(OldInst, &B);
+ auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
- Builder.SetInsertPoint(Latch->getTerminator());
- setDebugLocFromInst(Builder, OldInst);
+ B.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(OldInst, &B);
// Create i+1 and fill the PHINode.
- Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ //
+ // If the tail is not folded, we know that End - Start >= Step (either
+ // statically or through the minimum iteration checks). We also know that both
+ // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
+ // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
+ // overflows and we can mark the induction increment as NUW.
+ Value *Next = B.CreateAdd(Induction, Step, "index.next",
+ /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
Induction->addIncoming(Start, L->getLoopPreheader());
Induction->addIncoming(Next, Latch);
// Create the compare.
- Value *ICmp = Builder.CreateICmpEQ(Next, End);
- Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
+ Value *ICmp = B.CreateICmpEQ(Next, End);
+ B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();
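
The NUW reasoning above can be illustrated with Start = 0, Step = 8, End = 24: index.next takes the values 8, 16, 24 and the loop exits exactly when it equals End, so the unsigned add cannot wrap. A small editorial sketch of the increment emission (not part of the patch; emitIndexNext is a hypothetical helper):

// Editorial sketch, not part of the patch: NUW is only safe when the tail is
// not folded, per the reasoning in the comment above.
static Value *emitIndexNext(IRBuilder<> &B, Value *Induction, Value *Step,
                            bool FoldTailByMasking) {
  return B.CreateAdd(Induction, Step, "index.next",
                     /*HasNUW=*/!FoldTailByMasking, /*HasNSW=*/false);
}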
@@ -3038,18 +3196,13 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
- // There are two cases where we need to ensure (at least) the last iteration
- // runs in the scalar remainder loop. Thus, if the step evenly divides
- // the trip count, we set the remainder to be equal to the step. If the step
- // does not evenly divide the trip count, no adjustment is necessary since
- // there will already be scalar iterations. Note that the minimum iterations
- // check ensures that N >= Step. The cases are:
- // 1) If there is a non-reversed interleaved group that may speculatively
- // access memory out-of-bounds.
- // 2) If any instruction may follow a conditionally taken exit. That is, if
- // the loop contains multiple exiting blocks, or a single exiting block
- // which is not the latch.
- if (VF.isVector() && Cost->requiresScalarEpilogue()) {
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See the cost model for when this can happen. If the step evenly
+ // divides the trip count, we set the remainder to be equal to the step. If
+ // the step does not evenly divide the trip count, no adjustment is necessary
+ // since there will already be scalar iterations. Note that the minimum
+ // iterations check ensures that N >= Step.
+ if (Cost->requiresScalarEpilogue(VF)) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
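
As a worked example of the remainder adjustment above: with a trip count of 16 and Step == VF * UF == 8, R is 0, so when a scalar epilogue is required R is bumped to 8 and the vector loop covers only the first 8 iterations, leaving the last 8 for the scalar remainder loop. A minimal plain-C++ sketch (editorial, not part of the patch; adjustedRemainder is a hypothetical name, uint64_t from <cstdint>):

// Editorial sketch of the trip-count remainder adjustment, not part of the patch.
static uint64_t adjustedRemainder(uint64_t TC, uint64_t Step,
                                  bool RequiresScalarEpilogue) {
  uint64_t R = TC % Step;           // corresponds to "n.mod.vf" above
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                       // force at least one scalar iteration
  return R;                         // the vector trip count is then TC - R
}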
@@ -3103,8 +3256,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
@@ -3122,9 +3275,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass");
- // Update dominator for Bypass & LoopExit.
+ // Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+ if (!Cost->requiresScalarEpilogue(VF))
+ // If there is an epilogue which must run, there's no edge from the
+ // middle block to exit blocks and thus no need to update the immediate
+ // dominator of the exit blocks.
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
ReplaceInstWithInst(
TCCheckBlock->getTerminator(),
@@ -3132,63 +3289,48 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
LoopBypassBlocks.push_back(TCCheckBlock);
}
-void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
- // Reuse existing vector loop preheader for SCEV checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
-
- // Generate the code to check that the SCEV assumptions that we made.
- // We want the new basic block to start at the first instruction in a
- // sequence of instructions that form a check.
- SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
- "scev.check");
- Value *SCEVCheck = Exp.expandCodeForPredicate(
- &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
-
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
- if (C->isZero())
- return;
+BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+
+ BasicBlock *const SCEVCheckBlock =
+ RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
+ if (!SCEVCheckBlock)
+ return nullptr;
assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
(OptForSizeBasedOnProfile &&
Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size");
- SCEVCheckBlock->setName("vector.scevcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
- nullptr, "vector.ph");
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+ if (!Cost->requiresScalarEpilogue(VF))
+ // If there is an epilogue which must run, there's no edge from the
+ // middle block to exit blocks and thus no need to update the immediate
+ // dominator of the exit blocks.
+ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
}
- ReplaceInstWithInst(
- SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
+ return SCEVCheckBlock;
}
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
+ BasicBlock *Bypass) {
// VPlan-native path does not do any analysis for runtime checks currently.
if (EnableVPlanNativePath)
- return;
+ return nullptr;
- // Reuse existing vector loop preheader for runtime memory checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const MemCheckBlock = L->getLoopPreheader();
+ BasicBlock *const MemCheckBlock =
+ RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- auto *LAI = Legal->getLAI();
- const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
- if (!RtPtrChecking.Need)
- return;
+ // Check if we generated code that checks at runtime whether the arrays
+ // overlap. We put the checks into a separate block to make the more common
+ // case of few elements faster.
+ if (!MemCheckBlock)
+ return nullptr;
if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
@@ -3204,32 +3346,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
});
}
- MemCheckBlock->setName("vector.memcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
-
- auto *CondBranch = cast<BranchInst>(
- Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
- ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
LoopBypassBlocks.push_back(MemCheckBlock);
- AddedSafetyChecks = true;
- // Update dominator only if this is first RT check.
- if (LoopBypassBlocks.empty()) {
- DT->changeImmediateDominator(Bypass, MemCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
- }
-
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
- RtPtrChecking.getChecks(), RtPtrChecking.getSE());
- assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
- "claimed checks are required");
- CondBranch->setCondition(MemRuntimeCheck);
+ AddedSafetyChecks = true;
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
@@ -3238,6 +3357,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
DT, PSE.getSE());
LVer->prepareNoAliasMetadata();
+ return MemCheckBlock;
}
Value *InnerLoopVectorizer::emitTransformedIndex(
@@ -3247,8 +3367,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
SCEVExpander Exp(*SE, DL, "induction");
auto Step = ID.getStep();
auto StartValue = ID.getStartValue();
- assert(Index->getType() == Step->getType() &&
- "Index type does not match StepValue type");
+ assert(Index->getType()->getScalarType() == Step->getType() &&
+ "Index scalar type does not match StepValue type");
// Note: the IR at this point is broken. We cannot use SE to create any new
// SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -3267,14 +3387,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateAdd(X, Y);
};
+ // We allow X to be a vector type, in which case Y will potentially be
+ // splatted into a vector with the same element count.
auto CreateMul = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
+ assert(X->getType()->getScalarType() == Y->getType() &&
+ "Types don't match!");
if (auto *CX = dyn_cast<ConstantInt>(X))
if (CX->isOne())
return Y;
if (auto *CY = dyn_cast<ConstantInt>(Y))
if (CY->isOne())
return X;
+ VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+ if (XVTy && !isa<VectorType>(Y->getType()))
+ Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
return B.CreateMul(X, Y);
};
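
The CreateMul helper above splats a scalar operand across the element count of a vector operand before multiplying, which is what allows a vector Index to be transformed with a scalar Step. A standalone restatement of that pattern (editorial sketch, not part of the patch; mulMaybeSplat is a hypothetical name, assuming only the IRBuilder API):

// Editorial sketch of the splat-on-demand multiply, not part of the patch.
static Value *mulMaybeSplat(IRBuilder<> &B, Value *X, Value *Y) {
  assert(X->getType()->getScalarType() == Y->getType() && "Types don't match!");
  if (auto *XVTy = dyn_cast<VectorType>(X->getType()))
    if (!isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
  return B.CreateMul(X, Y);
}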
@@ -3290,8 +3416,11 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return LoopVectorBody->getTerminator();
return &*B.GetInsertPoint();
};
+
switch (ID.getKind()) {
case InductionDescriptor::IK_IntInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for integer inductions yet");
assert(Index->getType() == StartValue->getType() &&
"Index type does not match StartValue type");
if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@@ -3306,9 +3435,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateGEP(
StartValue->getType()->getPointerElementType(), StartValue,
CreateMul(Index,
- Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+ Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
+ GetInsertPoint())));
}
case InductionDescriptor::IK_FpInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for FP inductions yet");
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
auto InductionBinOp = ID.getInductionBinOp();
assert(InductionBinOp &&
@@ -3317,22 +3449,9 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
"Original bin op should be defined for FP induction");
Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
Value *MulExp = B.CreateFMul(StepValue, Index);
- if (isa<Instruction>(MulExp))
- // We have to check, the MulExp may be a constant.
- cast<Instruction>(MulExp)->setFastMathFlags(Flags);
-
- Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
- "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
-
- return BOp;
+ return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
}
case InductionDescriptor::IK_NoInduction:
return nullptr;
@@ -3343,9 +3462,10 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
- LoopExitBlock = OrigLoop->getUniqueExitBlock();
- assert(LoopExitBlock && "Must have an exit block");
assert(LoopVectorPreHeader && "Invalid loop structure");
+ LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
+ "multiple exit loop without required epilogue?");
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -3354,12 +3474,20 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
- // Set up branch from middle block to the exit and scalar preheader blocks.
- // completeLoopSkeleton will update the condition to use an iteration check,
- // if required to decide whether to execute the remainder.
- BranchInst *BrInst =
- BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+
+ // Set up the middle block terminator. Two cases:
+ // 1) If we know that we must execute the scalar epilogue, emit an
+ // unconditional branch.
+ // 2) Otherwise, we must have a single unique exit block (due to how we
+ // implement the multiple exit case). In this case, set up a conditional
+ // branch from the middle block to the loop scalar preheader, and the
+ // exit block. completeLoopSkeleton will update the condition to use an
+ // iteration check, if required to decide whether to execute the remainder.
+ BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
+ BranchInst::Create(LoopScalarPreHeader) :
+ BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
+ Builder.getTrue());
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
@@ -3371,7 +3499,11 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
nullptr, nullptr, Twine(Prefix) + "vector.body");
// Update dominator for loop exit.
- DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+ if (!Cost->requiresScalarEpilogue(VF))
+ // If there is an epilogue which must run, there's no edge from the
+ // middle block to exit blocks and thus no need to update the immediate
+ // dominator of the exit blocks.
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
// Create and register the new vector loop.
Loop *Lp = LI->AllocateLoop();
@@ -3419,6 +3551,11 @@ void InnerLoopVectorizer::createInductionResumeValues(
EndValue = VectorTripCount;
} else {
IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+
+ // Fast-math-flags propagate from the original induction instruction.
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+ B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
@@ -3468,10 +3605,14 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
// Add a check in the middle block to see if we have completed
- // all of the iterations in the first vector loop.
- // If (N - N%VF) == N, then we *don't* need to run the remainder.
- // If tail is to be folded, we know we don't need to run the remainder.
- if (!Cost->foldTailByMasking()) {
+ // all of the iterations in the first vector loop. Three cases:
+ // 1) If we require a scalar epilogue, there is no conditional branch as
+ // we unconditionally branch to the scalar preheader. Do nothing.
+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
+ // Thus if tail is to be folded, we know we don't need to run the
+ // remainder and we can use the previous value for the condition (true).
+ // 3) Otherwise, construct a runtime check.
+ if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
Count, VectorTripCount, "cmp.n",
LoopMiddleBlock->getTerminator());
@@ -3535,23 +3676,32 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
| [ ]_| <-- vector loop.
| |
| v
- | -[ ] <--- middle-block.
- | / |
- | / v
- -|- >[ ] <--- new preheader.
+ \ -[ ] <--- middle-block.
+ \/ |
+ /\ v
+ | ->[ ] <--- new preheader.
| |
- | v
+ (opt) v <-- edge from middle to exit iff epilogue is not required.
| [ ] \
- | [ ]_| <-- old scalar loop to handle remainder.
+ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
\ |
\ v
- >[ ] <-- exit block.
+ >[ ] <-- exit block(s).
...
*/
// Get the metadata of the original loop before it gets modified.
MDNode *OrigLoopID = OrigLoop->getLoopID();
+ // Workaround! Compute the trip count of the original loop and cache it
+ // before we start modifying the CFG. This code has a systemic problem
+ // wherein it tries to run analysis over partially constructed IR; this is
+ // wrong, and not simply for SCEV. The trip count of the original loop
+ // simply happens to be prone to hitting this in practice. In theory, we
+ // can hit the same issue for any SCEV, or ValueTracking query done during
+ // mutation. See PR49900.
+ getOrCreateTripCount(OrigLoop);
+
// Create an empty vector loop, and prepare basic blocks for the runtime
// checks.
Loop *Lp = createVectorLoopSkeleton("");
@@ -3640,6 +3790,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
assert(isa<PHINode>(UI) && "Expected LCSSA form");
IRBuilder<> B(MiddleBlock->getTerminator());
+
+ // Fast-math-flags propagate from the original induction instruction.
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+ B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
Value *CountMinusOne = B.CreateSub(
CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
Value *CMO =
@@ -3722,8 +3877,7 @@ static void cse(BasicBlock *BB) {
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ bool &NeedToScalarize) const {
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -3770,13 +3924,31 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
return Cost;
}
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+ if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+ return Elt;
+ return VectorType::get(Elt, VF);
+}
+
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
- ElementCount VF) {
+ ElementCount VF) const {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
-
- IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+ Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+ FMF = FPMO->getFastMathFlags();
+
+ SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
+ SmallVector<Type *> ParamTys;
+ std::transform(FTy->param_begin(), FTy->param_end(),
+ std::back_inserter(ParamTys),
+ [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
+
+ IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
+ dyn_cast<IntrinsicInst>(CI));
return TTI.getIntrinsicInstrCost(CostAttrs,
TargetTransformInfo::TCK_RecipThroughput);
}
@@ -3793,27 +3965,27 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) {
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
-void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
SmallPtrSet<Value *, 4> Erased;
for (const auto &KV : Cost->getMinimalBitwidths()) {
// If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
+ // type. The absence of the value from State indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ VPValue *Def = State.Plan->getVPValue(KV.first);
+ if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
+ Value *I = State.get(Def, Part);
if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy =
IntegerType::get(OriginalTy->getContext(), KV.second);
- auto *TruncatedTy = FixedVectorType::get(
- ScalarTruncatedTy,
- cast<FixedVectorType>(OriginalTy)->getNumElements());
+ auto *TruncatedTy = VectorType::get(
+ ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
if (TruncatedTy == OriginalTy)
continue;
@@ -3863,35 +4035,31 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
break;
}
} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
- ->getNumElements();
+ auto Elements0 =
+ cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
- ->getNumElements();
+ SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 =
+ cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1),
- FixedVectorType::get(ScalarTruncatedTy, Elements1));
+ SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
} else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
- ->getNumElements();
+ auto Elements =
+ cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
+ IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
- ->getNumElements();
+ auto Elements =
+ cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
+ EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
// If we don't know what to do, be conservative and don't do anything.
@@ -3904,58 +4072,65 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
Erased.insert(I);
- VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
+ State.reset(Def, Res, Part);
}
}
// We'll have created a bunch of ZExts that are now parentless. Clean up.
for (const auto &KV : Cost->getMinimalBitwidths()) {
// If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
+ // type. The absence of the value from State indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ VPValue *Def = State.Plan->getVPValue(KV.first);
+ if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
+ Value *I = State.get(Def, Part);
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
Value *NewI = Inst->getOperand(0);
Inst->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
+ State.reset(Def, NewI, Part);
}
}
}
}
-void InnerLoopVectorizer::fixVectorizedLoop() {
+void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF.isVector())
- truncateToMinimalBitwidths();
+ truncateToMinimalBitwidths(State);
// Fix widened non-induction PHIs by setting up the PHI operands.
if (OrigPHIsToFix.size()) {
assert(EnableVPlanNativePath &&
"Unexpected non-induction PHIs for fixup in non VPlan-native path");
- fixNonInductionPHIs();
+ fixNonInductionPHIs(State);
}
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs();
+ fixCrossIterationPHIs(State);
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
- // Fix-up external users of the induction variables.
- for (auto &Entry : Legal->getInductionVars())
- fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
- IVEndValues[Entry.first], LoopMiddleBlock);
+ // If we inserted an edge from the middle block to the unique exit block,
+ // update uses outside the loop (phis) to account for the newly inserted
+ // edge.
+ if (!Cost->requiresScalarEpilogue(VF)) {
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs(State);
+ }
- fixLCSSAPHIs();
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -3980,23 +4155,24 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixCrossIterationPHIs() {
+void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #2: We now need to fix the recurrences by adding incoming edges to
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
- for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
- // Handle first-order recurrences and reductions that need to be fixed.
- if (Legal->isFirstOrderRecurrence(&Phi))
- fixFirstOrderRecurrence(&Phi);
- else if (Legal->isReductionVariable(&Phi))
- fixReduction(&Phi);
+ VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
+ fixReduction(ReductionPhi, State);
+ else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+ fixFirstOrderRecurrence(FOR, State);
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
+ VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4020,7 +4196,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
//
  // In this example, s1 is a recurrence because its value depends on the
// previous iteration. In the first phase of vectorization, we created a
- // temporary value for s1. We now complete the vectorization and produce the
+ // vector phi v1 for s1. We now complete the vectorization and produce the
// shorthand vector IR shown below (for VF = 4, UF = 1).
//
// vector.ph:
@@ -4046,97 +4222,24 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.
- // Get the original loop preheader and single loop latch.
- auto *Preheader = OrigLoop->getLoopPreheader();
- auto *Latch = OrigLoop->getLoopLatch();
-
- // Get the initial and previous values of the scalar recurrence.
- auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
- auto *Previous = Phi->getIncomingValueForBlock(Latch);
-
- // Create a vector from the initial value.
- auto *VectorInit = ScalarInit;
- if (VF.isVector()) {
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- VectorInit = Builder.CreateInsertElement(
- PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
- Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
- }
-
- // We constructed a temporary phi node in the first phase of vectorization.
- // This phi node will eventually be deleted.
- Builder.SetInsertPoint(
- cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
-
- // Create a phi node for the new recurrence. The current value will either be
- // the initial value inserted into a vector or loop-varying vector value.
- auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
- VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
-
- // Get the vectorized previous value of the last part UF - 1. It appears last
- // among all unrolled iterations, due to the order of their construction.
- Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
-
- // Find and set the insertion point after the previous value if it is an
- // instruction.
- BasicBlock::iterator InsertPt;
- // Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- // FIXME: Loop invariant values do not form recurrences. We should deal with
- // them earlier.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
- InsertPt = LoopVectorBody->getFirstInsertionPt();
- else {
- Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
- if (isa<PHINode>(PreviousLastPart))
- // If the previous value is a phi node, we should insert after all the phi
- // nodes in the block containing the PHI to avoid breaking basic block
- // verification. Note that the basic block may be different to
- // LoopVectorBody, in case we predicate the loop.
- InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
- else
- InsertPt = ++PreviousInst->getIterator();
- }
- Builder.SetInsertPoint(&*InsertPt);
-
- // We will construct a vector for the recurrence by combining the values for
- // the current and previous iterations. This is the required shuffle mask.
- assert(!VF.isScalable());
- SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
- ShuffleMask[0] = VF.getKnownMinValue() - 1;
- for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
- ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
-
- // The vector from which to take the initial value for the current iteration
- // (actual or unrolled). Initially, this is the vector phi node.
- Value *Incoming = VecPhi;
-
- // Shuffle the current and previous vector and update the vector parts.
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
- Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
- auto *Shuffle =
- VF.isVector()
- ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
- : Incoming;
- PhiPart->replaceAllUsesWith(Shuffle);
- cast<Instruction>(PhiPart)->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
- Incoming = PreviousPart;
- }
+ auto *IdxTy = Builder.getInt32Ty();
+ auto *VecPhi = cast<PHINode>(State.get(PhiR, 0));
// Fix the latch value of the new recurrence in the vector loop.
+ VPValue *PreviousDef = PhiR->getBackedgeValue();
+ Value *Incoming = State.get(PreviousDef, UF - 1);
VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *ExtractForScalar = Incoming;
if (VF.isVector()) {
+ auto *One = ConstantInt::get(IdxTy, 1);
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- ExtractForScalar = Builder.CreateExtractElement(
- ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
- "vector.recur.extract");
+ auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
+ auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
+ ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
+ "vector.recur.extract");
}
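
For a scalable VF such as <vscale x 4 x i32> the last-lane index above is vscale * 4 - 1, which is not a compile-time constant, hence the runtime subtraction replacing the old getKnownMinValue() - 1 constant. A minimal editorial sketch (not part of the patch; getRuntimeVF is assumed to be the helper used elsewhere in this file, extractLastLane is a hypothetical name):

// Editorial sketch of the runtime last-lane extract, not part of the patch.
static Value *extractLastLane(IRBuilder<> &B, Value *Vec, ElementCount VF) {
  Type *IdxTy = B.getInt32Ty();
  // KnownMin for fixed VFs, vscale * KnownMin for scalable VFs (assumed helper).
  Value *RuntimeVF = getRuntimeVF(B, IdxTy, VF);
  Value *LastIdx = B.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 1));
  return B.CreateExtractElement(Vec, LastIdx, "vector.recur.extract");
}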
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
@@ -4144,20 +4247,23 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// will be the value when jumping to the exit block from the LoopMiddleBlock,
// when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector())
+ if (VF.isVector()) {
+ auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
+ auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
- "vector.recur.extract.for.phi");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
- // `Incoming`. This is analogous to the vectorized case above: extracting the
- // second last element when VF > 1.
- else if (UF > 1)
- ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+ Incoming, Idx, "vector.recur.extract.for.phi");
+ } else if (UF > 1)
+ // When loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
+ // of `Incoming`. This is analogous to the vectorized case above: extracting
+ // the second last element when VF > 1.
+ ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
for (auto *BB : predecessors(LoopScalarPreHeader)) {
auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
@@ -4173,44 +4279,49 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// recurrence in the exit block, and then add an edge for the middle block.
// Note that LCSSA does not imply single entry when the original scalar loop
// had multiple exiting edges (as we always run the last iteration in the
- // scalar epilogue); in that case, the exiting path through middle will be
- // dynamically dead and the value picked for the phi doesn't matter.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [Phi](Value *V) { return V == Phi; }))
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
-}
-
-void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ // scalar epilogue); in that case, there is no edge from middle to exit and
+ // thus no phis need to be updated.
+ if (!Cost->requiresScalarEpilogue(VF))
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [Phi](Value *V) { return V == Phi; }))
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+}
+
+void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
+ VPTransformState &State) {
+ PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
+ assert(Legal->isReductionVariable(OrigPhi) &&
"Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- setDebugLocFromInst(Builder, ReductionStartValue);
- bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
+ setDebugLocFromInst(ReductionStartValue);
+ VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
// This is the vector-clone of the value that leaves the loop.
- Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+ Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(RdxDesc);
+ clearReductionWrapFlags(RdxDesc, State);
// Fix the vector-loop phi.
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
- Value *Val = getOrCreateVectorValue(LoopVal, Part);
- cast<PHINode>(VecRdxPhi)
- ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF;
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
+ Value *Val = State.get(PhiR->getBackedgeValue(), Part);
+ if (PhiR->isOrdered())
+ Val = State.get(PhiR->getBackedgeValue(), UF - 1);
+
+ cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
}
// Before each round, move the insertion point right between
@@ -4219,16 +4330,16 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- setDebugLocFromInst(Builder, LoopExitInst);
+ setDebugLocFromInst(LoopExitInst);
+ Type *PhiTy = OrigPhi->getType();
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
// be predicated, and does not need to be handled here.
- if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
+ if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecLoopExitInst =
- VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
Value *Sel = nullptr;
for (User *U : VecLoopExitInst->users()) {
if (isa<SelectInst>(U)) {
@@ -4238,19 +4349,19 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
}
assert(Sel && "Reduction exit feeds no select");
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+ State.reset(LoopExitInstDef, Sel, Part);
// If the target can create a predicated operator for the reduction at no
// extra cost in the loop (for example a predicated vadd), it can be
// cheaper for the select to remain in the loop than be sunk out of it,
// and so use the select value for the phi instead of the old
// LoopExitValue.
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
if (PreferPredicatedReductionSelect ||
TTI->preferPredicatedReductionSelect(
- RdxDesc.getOpcode(), Phi->getType(),
+ RdxDesc.getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags())) {
- auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
+ auto *VecRdxPhi =
+ cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
}
@@ -4260,15 +4371,14 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
- if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
- assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ RdxParts[Part] = State.get(LoopExitInstDef, Part);
Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
: Builder.CreateZExt(Trunc, VecTy);
@@ -4284,12 +4394,12 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ State.reset(LoopExitInstDef, RdxParts[Part], Part);
}
}
// Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
unsigned Op = RecurrenceDescriptor::getOpcode(RK);
// The middle block terminator has already been assigned a DebugLoc here (the
@@ -4299,36 +4409,40 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
- setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
- ReducedPartRdx, "bin.rdx"),
- RdxDesc.getFastMathFlags());
- else
- ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+ setDebugLocFromInst(LoopMiddleBlock->getTerminator());
+ if (PhiR->isOrdered())
+ ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+ else {
+ // Floating-point operations should have some FMF to enable the reduction.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = State.get(LoopExitInstDef, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ ReducedPartRdx = Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ } else {
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+ }
+ }
}
// Create the reduction after the loop. Note that inloop reductions create the
// target reduction in the loop using a Reduction recipe.
- if (VF.isVector() && !IsInLoopReductionPhi) {
+ if (VF.isVector() && !PhiR->isInLoop()) {
ReducedPartRdx =
createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ if (PhiTy != RdxDesc.getRecurrenceType())
+ ReducedPartRdx = RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+ : Builder.CreateZExt(ReducedPartRdx, PhiTy);
}
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
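
As a rough standalone illustration (assuming a simple integer add reduction; the values and sizes are made up, and the real code works on IR, not containers): after the vector loop there are UF partial accumulators, which are first combined pairwise with the reduction opcode ("bin.rdx") and only then reduced horizontally across lanes. For an ordered (strict FP) reduction only the last part is used, because the parts were already chained in order inside the loop.

// Illustrative only: combine UF unrolled partial accumulators of a
// vectorized add reduction, then perform the horizontal reduction.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned UF = 4, VF = 4;
  // Each "part" is one vector accumulator produced by the unrolled body.
  std::vector<std::vector<int>> Parts(UF, std::vector<int>(VF, 1));

  // bin.rdx: fold all parts into part 0 using the reduction opcode (add).
  std::vector<int> ReducedPartRdx = Parts[0];
  for (unsigned Part = 1; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      ReducedPartRdx[Lane] += Parts[Part][Lane];

  // Final target reduction across lanes, done after the loop.
  int Result = std::accumulate(ReducedPartRdx.begin(), ReducedPartRdx.end(), 0);
  std::printf("reduced value = %d\n", Result); // 16 for this toy input
  return 0;
}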
@@ -4340,24 +4454,25 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
 // fixFirstOrderRecurrence for a more complete explanation of the logic.
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [LoopExitInst](Value *V) { return V == LoopExitInst; }))
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ if (!Cost->requiresScalarEpilogue(VF))
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis())
+ if (any_of(LCSSAPhi.incoming_values(),
+ [LoopExitInst](Value *V) { return V == LoopExitInst; }))
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+ OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
-void InnerLoopVectorizer::clearReductionWrapFlags(
- RecurrenceDescriptor &RdxDesc) {
+void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+ VPTransformState &State) {
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurKind::Add && RK != RecurKind::Mul)
return;
@@ -4373,7 +4488,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags(
Instruction *Cur = Worklist.pop_back_val();
if (isa<OverflowingBinaryOperator>(Cur))
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = getOrCreateVectorValue(Cur, Part);
+ Value *V = State.get(State.Plan->getVPValue(Cur), Part);
cast<Instruction>(V)->dropPoisonGeneratingFlags();
}
@@ -4386,7 +4501,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags(
}
}
-void InnerLoopVectorizer::fixLCSSAPHIs() {
+void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
// Some phis were already hand updated by the reduction and recurrence
@@ -4395,19 +4510,21 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
// Non-instruction incoming values will have only one value.
- unsigned LastLane = 0;
- if (isa<Instruction>(IncomingValue))
- LastLane = Cost->isUniformAfterVectorization(
- cast<Instruction>(IncomingValue), VF)
- ? 0
- : VF.getKnownMinValue() - 1;
- assert((!VF.isScalable() || LastLane == 0) &&
- "scalable vectors dont support non-uniform scalars yet");
+
+ VPLane Lane = VPLane::getFirstLane();
+ if (isa<Instruction>(IncomingValue) &&
+ !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+ VF))
+ Lane = VPLane::getLastLaneForVF(VF);
+
// Can be a loop invariant incoming value or the last scalar value to be
// extracted from the vectorized loop.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Value *lastIncomingValue =
- getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+ OrigLoop->isLoopInvariant(IncomingValue)
+ ? IncomingValue
+ : State.get(State.Plan->getVPValue(IncomingValue),
+ VPIteration(UF - 1, Lane));
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
}
@@ -4450,12 +4567,22 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
while (!Worklist.empty()) {
auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
- // We can't sink an instruction if it is a phi node, is already in the
- // predicated block, is not in the loop, or may have side effects.
- if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
- !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ // We can't sink an instruction if it is a phi node, is not in the loop,
+ // or may have side effects.
+ if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
+ I->mayHaveSideEffects())
continue;
+ // If the instruction is already in PredBB, check if we can sink its
+ // operands. In that case, VPlan's sinkScalarOperands() succeeded in
+ // sinking the scalar instruction I, hence it appears in PredBB; but it
+ // may have failed to sink I's operands (recursively), which we try
+ // (again) here.
+ if (I->getParent() == PredBB) {
+ Worklist.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+
// It's legal to sink the instruction if all its uses occur in the
// predicated block. Otherwise, there's nothing to do yet, and we may
// need to reanalyze the instruction.
@@ -4476,42 +4603,25 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
-void InnerLoopVectorizer::fixNonInductionPHIs() {
+void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
for (PHINode *OrigPhi : OrigPHIsToFix) {
- PHINode *NewPhi =
- cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
- unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
-
- SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
- predecessors(OrigPhi->getParent()));
- SmallVector<BasicBlock *, 2> VectorBBPredecessors(
- predecessors(NewPhi->getParent()));
- assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
- "Scalar and Vector BB should have the same number of predecessors");
-
- // The insertion point in Builder may be invalidated by the time we get
- // here. Force the Builder insertion point to something valid so that we do
- // not run into issues during insertion point restore in
- // getOrCreateVectorValue calls below.
+ VPWidenPHIRecipe *VPPhi =
+ cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
+ PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
+ // Make sure the builder has a valid insert point.
Builder.SetInsertPoint(NewPhi);
-
- // The predecessor order is preserved and we can rely on mapping between
- // scalar and vector block predecessors.
- for (unsigned i = 0; i < NumIncomingValues; ++i) {
- BasicBlock *NewPredBB = VectorBBPredecessors[i];
-
- // When looking up the new scalar/vector values to fix up, use incoming
- // values from original phi.
- Value *ScIncV =
- OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
-
- // Scalar incoming value may need a broadcast
- Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
- NewPhi->addIncoming(NewIncV, NewPredBB);
+ for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+ VPValue *Inc = VPPhi->getIncomingValue(i);
+ VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+ NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
}
}
}
+bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+ return Cost->useOrderedReductions(RdxDesc);
+}
+
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
VPUser &Operands, unsigned UF,
ElementCount VF, bool IsPtrLoopInvariant,
@@ -4539,7 +4649,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
auto *Clone = Builder.Insert(GEP->clone());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- State.set(VPDef, GEP, EntryPart, Part);
+ State.set(VPDef, EntryPart, Part);
addMetadata(EntryPart, GEP);
}
} else {
@@ -4553,8 +4663,9 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
for (unsigned Part = 0; Part < UF; ++Part) {
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
- : State.get(Operands.getOperand(0), Part);
+ auto *Ptr = IsPtrLoopInvariant
+ ? State.get(Operands.getOperand(0), VPIteration(0, 0))
+ : State.get(Operands.getOperand(0), Part);
// Collect all the indices for the new GEP. If any index is
// loop-invariant, we won't broadcast it.
@@ -4562,7 +4673,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
VPValue *Operand = Operands.getOperand(I);
if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, {0, 0}));
+ Indices.push_back(State.get(Operand, VPIteration(0, 0)));
else
Indices.push_back(State.get(Operand, Part));
}
@@ -4576,27 +4687,26 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
: Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
- State.set(VPDef, GEP, NewGEP, Part);
+ State.set(VPDef, NewGEP, Part);
addMetadata(NewGEP, GEP);
}
}
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
- RecurrenceDescriptor *RdxDesc,
- Value *StartV, unsigned UF,
- ElementCount VF) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ VPWidenPHIRecipe *PhiR,
+ VPTransformState &State) {
PHINode *P = cast<PHINode>(PN);
if (EnableVPlanNativePath) {
// Currently we enter here in the VPlan-native path for non-induction
// PHIs where all control flow is uniform. We simply widen these PHIs.
// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.
- Type *VecTy =
- (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ Type *VecTy = (State.VF.isScalar())
+ ? PN->getType()
+ : VectorType::get(PN->getType(), State.VF);
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
- VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+ State.set(PhiR, VecPhi, 0);
OrigPHIsToFix.push_back(P);
return;
@@ -4609,61 +4719,11 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
- Value *Iden = nullptr;
- bool ScalarPHI =
- (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
- Type *VecTy =
- ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
-
- if (RdxDesc) {
- assert(Legal->isReductionVariable(P) && StartV &&
- "RdxDesc should only be set for reduction variables; in that case "
- "a StartV is also required");
- RecurKind RK = RdxDesc->getRecurrenceKind();
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
- // MinMax reduction have the start value as their identify.
- if (ScalarPHI) {
- Iden = StartV;
- } else {
- IRBuilderBase::InsertPointGuard IPBuilder(Builder);
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
- }
- } else {
- Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- Iden = IdenC;
-
- if (!ScalarPHI) {
- Iden = ConstantVector::getSplat(VF, IdenC);
- IRBuilderBase::InsertPointGuard IPBuilder(Builder);
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- Constant *Zero = Builder.getInt32(0);
- StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
- }
- }
- }
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- // This is phase one of vectorizing PHIs.
- Value *EntryPart = PHINode::Create(
- VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
- VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
- if (StartV) {
- // Make sure to add the reduction start value only to the
- // first unroll part.
- Value *StartVal = (Part == 0) ? StartV : Iden;
- cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
- }
- }
- return;
- }
assert(!Legal->isReductionVariable(P) &&
- "reductions should be handled above");
+ "reductions should be handled elsewhere");
- setDebugLocFromInst(Builder, P);
+ setDebugLocFromInst(P);
// This PHINode must be an induction variable.
// Make sure that we know about it.
@@ -4684,24 +4744,49 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
- if (Cost->isScalarAfterVectorization(P, VF)) {
+ if (Cost->isScalarAfterVectorization(P, State.VF)) {
// This is the normalized GEP that starts counting at zero.
Value *PtrInd =
Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes =
- Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
+ bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
+ unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+
+ bool NeedsVectorIndex = !IsUniform && VF.isScalable();
+ Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
+ if (NeedsVectorIndex) {
+ Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
+ UnitStepVec = Builder.CreateStepVector(VecIVTy);
+ PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
+ }
+
for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PartStart = createStepForVF(
+ Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+
+ if (NeedsVectorIndex) {
+ Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
+ Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
+ Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
+ Value *SclrGep =
+ emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
+ SclrGep->setName("next.gep");
+ State.set(PhiR, SclrGep, Part);
+ // We've cached the whole vector, which means we can support the
+ // extraction of any lane.
+ continue;
+ }
+
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Constant *Idx = ConstantInt::get(PtrInd->getType(),
- Lane + Part * VF.getKnownMinValue());
+ Value *Idx = Builder.CreateAdd(
+ PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep =
emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+ State.set(PhiR, SclrGep, VPIteration(Part, Lane));
}
}
return;
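
A small sketch (illustrative, fixed VF standing in for the runtime value) of the index arithmetic for the scalarized pointer induction: each unroll part starts at Part * VF, each lane within the part adds its lane number, and the resulting normalized index is what gets added to PtrInd before being transformed into the pointer step.

// Illustrative only: per-lane normalized indices for a scalarized
// pointer induction. With scalable vectors, PartStart is computed at
// runtime as Part * vscale * MinLanes; here VF is a fixed stand-in.
#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4;
  for (unsigned Part = 0; Part < UF; ++Part) {
    unsigned PartStart = Part * VF; // createStepForVF for a fixed VF
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      unsigned Idx = PartStart + Lane;
      std::printf("part %u lane %u -> index %u\n", Part, Lane, Idx);
    }
  }
  return 0;
}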
@@ -4724,32 +4809,34 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
SCEVExpander Exp(*PSE.getSE(), DL, "induction");
Value *ScalarStepValue =
Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
+ Value *NumUnrolledElems =
+ Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
Value *InductionGEP = GetElementPtrInst::Create(
ScStValueType->getPointerElementType(), NewPointerPhi,
- Builder.CreateMul(
- ScalarStepValue,
- ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
- "ptr.ind", InductionLoc);
+ Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+ InductionLoc);
NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
// Create UF many actual address geps that use the pointer
// phi as base and a vectorized version of the step value
// (<step*0, ..., step*N>) as offset.
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Type *VecPhiType = VectorType::get(PhiType, State.VF);
+ Value *StartOffsetScalar =
+ Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
+ Value *StartOffset =
+ Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
// Create a vector of consecutive numbers from zero to VF.
- for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
- Indices.push_back(
- ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
- Constant *StartOffset = ConstantVector::get(Indices);
+ StartOffset =
+ Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
Value *GEP = Builder.CreateGEP(
ScStValueType->getPointerElementType(), NewPointerPhi,
Builder.CreateMul(
- StartOffset,
- Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
+ StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
"vector.gep"));
- VectorLoopValueMap.setVectorValue(P, Part, GEP);
+ State.set(PhiR, GEP, Part);
}
}
}
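
A rough sketch of the offset vectors built for the widened pointer induction (again with a fixed VF and an assumed step, purely for illustration): each part's offsets are Part * VF + <0, 1, ..., VF-1>, scaled by the scalar step, while the pointer phi itself advances by VF * UF * step per vector iteration.

// Illustrative only: offsets added to the pointer phi for each unroll part.
#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4;
  const long Step = 8; // assumed scalar step of the induction
  for (unsigned Part = 0; Part < UF; ++Part) {
    long StartOffsetScalar = static_cast<long>(VF) * Part; // RuntimeVF * Part
    std::printf("part %u offsets:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %ld", (StartOffsetScalar + Lane) * Step); // vector.gep
    std::printf("\n");
  }
  // ptr.ind: the phi advances by RuntimeVF * UF * Step each vector iteration.
  std::printf("per-iteration pointer increment = %ld\n",
              static_cast<long>(VF) * UF * Step);
  return 0;
}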
@@ -4803,7 +4890,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
- setDebugLocFromInst(Builder, &I);
+ setDebugLocFromInst(&I);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 2> Ops;
@@ -4816,7 +4903,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
VecOp->copyIRFlags(&I);
// Use this vector value for all users of the original instruction.
- State.set(Def, &I, V, Part);
+ State.set(Def, V, Part);
addMetadata(V, &I);
}
@@ -4827,7 +4914,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
// Widen compares. Generate vector compares.
bool FCmp = (I.getOpcode() == Instruction::FCmp);
auto *Cmp = cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
+ setDebugLocFromInst(Cmp);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *A = State.get(User.getOperand(0), Part);
Value *B = State.get(User.getOperand(1), Part);
@@ -4840,7 +4927,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
- State.set(Def, &I, C, Part);
+ State.set(Def, C, Part);
addMetadata(C, &I);
}
@@ -4860,7 +4947,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
case Instruction::FPTrunc:
case Instruction::BitCast: {
auto *CI = cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
+ setDebugLocFromInst(CI);
/// Vectorize casts.
Type *DestTy =
@@ -4869,7 +4956,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
for (unsigned Part = 0; Part < UF; ++Part) {
Value *A = State.get(User.getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- State.set(Def, &I, Cast, Part);
+ State.set(Def, Cast, Part);
addMetadata(Cast, &I);
}
break;
@@ -4886,7 +4973,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPTransformState &State) {
assert(!isa<DbgInfoIntrinsic>(I) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- setDebugLocFromInst(Builder, &I);
+ setDebugLocFromInst(&I);
Module *M = I.getParent()->getParent()->getParent();
auto *CI = cast<CallInst>(&I);
@@ -4906,10 +4993,11 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
- assert(IntrinsicCost.isValid() && CallCost.isValid() &&
- "Cannot have invalid costs while widening");
+ assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
+ "Either the intrinsic cost or vector call cost must be valid");
for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Type *, 2> TysForDecl = {CI->getType()};
SmallVector<Value *, 4> Args;
for (auto &I : enumerate(ArgOperands.operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -4917,19 +5005,19 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
Value *Arg;
if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
Arg = State.get(I.value(), Part);
- else
- Arg = State.get(I.value(), {0, 0});
+ else {
+ Arg = State.get(I.value(), VPIteration(0, 0));
+ if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
+ TysForDecl.push_back(Arg->getType());
+ }
Args.push_back(Arg);
}
Function *VectorF;
if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF.isVector()) {
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ if (VF.isVector())
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- }
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
@@ -4948,7 +5036,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(CI);
- State.set(Def, &I, V, Part);
+ State.set(Def, V, Part);
addMetadata(V, &I);
}
}
@@ -4957,14 +5045,15 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
VPUser &Operands,
bool InvariantCond,
VPTransformState &State) {
- setDebugLocFromInst(Builder, &I);
+ setDebugLocFromInst(&I);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- auto *InvarCond =
- InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
+ auto *InvarCond = InvariantCond
+ ? State.get(Operands.getOperand(0), VPIteration(0, 0))
+ : nullptr;
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cond =
@@ -4972,7 +5061,7 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
Value *Op0 = State.get(Operands.getOperand(1), Part);
Value *Op1 = State.get(Operands.getOperand(2), Part);
Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
- State.set(VPDef, &I, Sel, Part);
+ State.set(VPDef, Sel, Part);
addMetadata(Sel, &I);
}
}
@@ -5034,13 +5123,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
if (isScalarPtrInduction(MemAccess, Ptr)) {
Worklist.insert(cast<Instruction>(Ptr));
- Instruction *Update = cast<Instruction>(
- cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
- Worklist.insert(Update);
LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
<< "\n");
- LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
- << "\n");
+
+ Instruction *Update = cast<Instruction>(
+ cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+ ScalarPtrs.insert(Update);
return;
}
// We only care about bitcast and getelementptr instructions contained in
@@ -5054,11 +5142,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Worklist.count(I))
return;
- // If the use of the pointer will be a scalar use, and all users of the
- // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
- // place the pointer in PossibleNonScalarPtrs.
- if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
- return isa<LoadInst>(U) || isa<StoreInst>(U);
+ // If all users of the pointer will be memory accesses and scalar, place the
+ // pointer in ScalarPtrs. Otherwise, place the pointer in
+ // PossibleNonScalarPtrs.
+ if (llvm::all_of(I->users(), [&](User *U) {
+ return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ isScalarUse(cast<Instruction>(U), Ptr);
}))
ScalarPtrs.insert(I);
else
@@ -5164,8 +5253,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
- ElementCount VF) {
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
if (!blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -5176,20 +5264,12 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
if (!Legal->isMaskRequired(I))
return false;
auto *Ptr = getLoadStorePointerOperand(I);
- auto *Ty = getMemInstValueType(I);
- // We have already decided how to vectorize this instruction, get that
- // result.
- if (VF.isVector()) {
- InstWidening WideningDecision = getWideningDecision(I, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
- return WideningDecision == CM_Scalarize;
- }
+ auto *Ty = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- isLegalMaskedGather(Ty, Alignment))
+ TTI.isLegalMaskedGather(Ty, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- isLegalMaskedScatter(Ty, Alignment));
+ TTI.isLegalMaskedScatter(Ty, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -5211,8 +5291,8 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = getMemInstValueType(I);
- if (hasIrregularType(ScalarTy, DL, VF))
+ auto *ScalarTy = getLoadStoreType(I);
+ if (hasIrregularType(ScalarTy, DL))
return false;
// Check if masking is required.
@@ -5231,7 +5311,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
assert(useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled.");
- auto *Ty = getMemInstValueType(I);
+ auto *Ty = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
: TTI.isLegalMaskedStore(Ty, Alignment);
@@ -5259,7 +5339,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
- if (hasIrregularType(ScalarTy, DL, VF))
+ if (hasIrregularType(ScalarTy, DL))
return false;
return true;
@@ -5302,7 +5382,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
+ if (isScalarWithPredication(I)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
return;
@@ -5347,7 +5427,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// here is something which only demands lane 0 of the unrolled iterations;
// it does not imply that all lanes produce the same value (e.g. this is not
// the usual meaning of uniform)
- SmallPtrSet<Value *, 8> HasUniformUse;
+ SetVector<Value *> HasUniformUse;
// Scan the loop for instructions which are either a) known to have only
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
@@ -5483,7 +5563,158 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<ElementCount>
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
+ reportVectorizationInfo(
+ "Disabling scalable vectorization, because target does not "
+ "support scalable vectors.",
+ "ScalableVectorsUnsupported", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ if (Hints->isScalableVectorizationDisabled()) {
+ reportVectorizationInfo("Scalable vectorization is explicitly disabled",
+ "ScalableVectorizationDisabled", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ auto MaxScalableVF = ElementCount::getScalable(
+ std::numeric_limits<ElementCount::ScalarTy>::max());
+
+ // Test that the loop-vectorizer can legalize all operations for this MaxVF.
+ // FIXME: While for scalable vectors this is currently sufficient, this should
+ // be replaced by a more detailed mechanism that filters out specific VFs,
+ // instead of invalidating vectorization for a whole set of VFs based on the
+ // MaxVF.
+
+ // Disable scalable vectorization if the loop contains unsupported reductions.
+ if (!canVectorizeReductions(MaxScalableVF)) {
+ reportVectorizationInfo(
+ "Scalable vectorization not supported for the reduction "
+ "operations found in this loop.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ // Disable scalable vectorization if the loop contains any instructions
+ // with element types not supported for scalable vectors.
+ if (any_of(ElementTypesInLoop, [&](Type *Ty) {
+ return !Ty->isVoidTy() &&
+ !this->TTI.isElementTypeLegalForScalableVector(Ty);
+ })) {
+ reportVectorizationInfo("Scalable vectorization is not supported "
+ "for all element types found in this loop.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ if (Legal->isSafeForAnyVectorWidth())
+ return MaxScalableVF;
+
+ // Limit MaxScalableVF by the maximum safe dependence distance.
+ Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ MaxScalableVF = ElementCount::getScalable(
+ MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+ if (!MaxScalableVF)
+ reportVectorizationInfo(
+ "Max legal vector width too small, scalable vectorization "
+ "unfeasible.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+
+ return MaxScalableVF;
+}
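
To make the clamping concrete, a toy calculation under assumed numbers (not taken from the patch): with a maximum safe dependence distance of 32 elements and a target whose vscale can be at most 16, the largest legal scalable VF is vscale x 2, since 16 * 2 lanes never exceed the 32-element bound.

// Illustrative only: clamp a scalable VF by the max safe element count.
// MaxSafeElements and MaxVScale are assumed example values.
#include <cstdio>

int main() {
  unsigned MaxSafeElements = 32; // from dependence analysis
  unsigned MaxVScale = 16;       // target's maximum vscale
  unsigned MaxScalableMinLanes = MaxSafeElements / MaxVScale;
  if (MaxScalableMinLanes == 0)
    std::printf("scalable vectorization unfeasible\n");
  else
    std::printf("max legal scalable VF = vscale x %u\n", MaxScalableMinLanes);
  return 0;
}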
+
+FixedScalableVFPair
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory accesses that is most restrictive (involved in the smallest
+ // dependence distance).
+ unsigned MaxSafeElements =
+ PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+
+ LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
+ << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
+ << ".\n");
+
+ // First analyze the UserVF, fall back if the UserVF should be ignored.
+ if (UserVF) {
+ auto MaxSafeUserVF =
+ UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
+
+ if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
+ // If `VF=vscale x N` is safe, then so is `VF=N`
+ if (UserVF.isScalable())
+ return FixedScalableVFPair(
+ ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
+ else
+ return UserVF;
+ }
+
+ assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
+
+ // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
+ // is better to ignore the hint and let the compiler choose a suitable VF.
+ if (!UserVF.isScalable()) {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF="
+ << MaxSafeFixedVF << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeFixedVF);
+ });
+ return MaxSafeFixedVF;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe. Ignoring scalable UserVF.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe. Ignoring the hint to let the compiler pick a "
+ "suitable VF.";
+ });
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+
+ FixedScalableVFPair Result(ElementCount::getFixed(1),
+ ElementCount::getScalable(0));
+ if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
+ WidestType, MaxSafeFixedVF))
+ Result.FixedVF = MaxVF;
+
+ if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
+ WidestType, MaxSafeScalableVF))
+ if (MaxVF.isScalable()) {
+ Result.ScalableVF = MaxVF;
+ LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
+ << "\n");
+ }
+
+ return Result;
+}
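
The fixed-width clamp can be illustrated with a toy calculation under assumed numbers: a maximum safe dependence width of 384 bits and a widest loop type of 32 bits give 12 safe elements, which rounding down to a power of 2 turns into a maximum safe fixed VF of 8.

// Illustrative only: derive the max safe fixed VF from the safe vector
// width reported by dependence analysis and the widest element type.
#include <cstdio>

static unsigned powerOf2Floor(unsigned N) {
  if (N == 0)
    return 0;
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

int main() {
  unsigned MaxSafeVectorWidthInBits = 384; // assumed example value
  unsigned WidestTypeBits = 32;            // widest element type in bits
  unsigned MaxSafeElements =
      powerOf2Floor(MaxSafeVectorWidthInBits / WidestTypeBits);
  std::printf("max safe fixed VF = %u\n", MaxSafeElements); // 8
  return 0;
}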
+
+FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
 // TODO: It may be useful to do since it's still likely to be dynamically
@@ -5492,7 +5723,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Not inserting runtime ptr check for divergent target",
"runtime pointer checks needed. Not enabled for divergent target",
"CantVersionLoopWithDivergentTarget", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@@ -5501,14 +5732,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
reportVectorizationFailure("Single iteration (non) loop",
"loop trip count is one, irrelevant for vectorization",
"SingleIterationLoop", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
- ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
-
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return MaxVF;
+ return computeFeasibleMaxVF(TC, UserVF);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5530,7 +5759,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Bail if runtime checks are required, which are not good when optimising
// for size.
if (runtimeChecksRequired())
- return None;
+ return FixedScalableVFPair::getNone();
break;
}
@@ -5546,9 +5775,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return MaxVF;
+ return computeFeasibleMaxVF(TC, UserVF);
}
- return None;
+ return FixedScalableVFPair::getNone();
}
// Now try the tail folding
@@ -5563,33 +5792,44 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- assert(!MaxVF.isScalable() &&
- "Scalable vectors do not yet support tail folding");
- assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
- "MaxVF must be a power of 2");
- unsigned MaxVFtimesIC =
- UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
- // Avoid tail folding if the trip count is known to be a multiple of any VF we
- // chose.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
- const SCEV *Rem = SE->getURemExpr(
- ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
- if (Rem->isZero()) {
- // Accept MaxVF if we do not have a tail.
- LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
- return MaxVF;
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
+ // Avoid tail folding if the trip count is known to be a multiple of any VF
+ // we chose.
+ // FIXME: The condition below pessimises the case for fixed-width vectors,
+ // when scalable VFs are also candidates for vectorization.
+ if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
+ ElementCount MaxFixedVF = MaxFactors.FixedVF;
+ assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+ "MaxFixedVF must be a power of 2");
+ unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
+ : MaxFixedVF.getFixedValue();
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *Rem = SE->getURemExpr(
+ SE->applyLoopGuards(ExitCount, TheLoop),
+ SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ if (Rem->isZero()) {
+ // Accept MaxFixedVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxFactors;
+ }
}
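
A toy numeric check of this condition, with assumed values: for a trip count of 64, a max fixed VF of 8 and a user interleave count of 2, 64 % (8 * 2) == 0, so no scalar tail remains and tail folding can be skipped.

// Illustrative only: decide whether any scalar tail remains for a
// fixed-width VF times the interleave count.
#include <cstdio>

int main() {
  unsigned TripCount = 64;   // assumed known trip count
  unsigned MaxFixedVF = 8;   // chosen max fixed VF
  unsigned UserIC = 2;       // user-specified interleave count
  unsigned MaxVFtimesIC = MaxFixedVF * UserIC;
  if (TripCount % MaxVFtimesIC == 0)
    std::printf("no tail will remain for any chosen VF\n");
  else
    std::printf("tail of %u iterations; consider folding by masking\n",
                TripCount % MaxVFtimesIC);
  return 0;
}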
+ // For scalable vectors, don't use tail folding as this is currently not yet
+ // supported. The code is likely to have ended up here if the tripcount is
+ // low, in which case it makes sense not to use scalable vectors.
+ if (MaxFactors.ScalableVF.isVector())
+ MaxFactors.ScalableVF = ElementCount::getScalable(0);
+
// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
FoldTailByMasking = true;
- return MaxVF;
+ return MaxFactors;
}
// If there was a tail-folding hint/switch, but we can't fold the tail by
@@ -5598,12 +5838,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return MaxVF;
+ return MaxFactors;
}
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
- return None;
+ return FixedScalableVFPair::getNone();
}
if (TC == 0) {
@@ -5611,7 +5851,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Unable to calculate the loop count due to complex control flow",
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
reportVectorizationFailure(
@@ -5620,137 +5860,67 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Enable vectorization of this loop with '#pragma clang loop "
"vectorize(enable)' when compiling with -Os/-Oz",
"NoTailLoopWithOptForSize", ORE, TheLoop);
- return None;
-}
-
-ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF) {
- bool IgnoreScalableUserVF = UserVF.isScalable() &&
- !TTI.supportsScalableVectors() &&
- !ForceTargetSupportsScalableVectors;
- if (IgnoreScalableUserVF) {
- LLVM_DEBUG(
- dbgs() << "LV: Ignoring VF=" << UserVF
- << " because target does not support scalable vectors.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "Ignoring VF=" << ore::NV("UserVF", UserVF)
- << " because target does not support scalable vectors.";
- });
- }
-
- // Beyond this point two scenarios are handled. If UserVF isn't specified
- // then a suitable VF is chosen. If UserVF is specified and there are
- // dependencies, check if it's legal. However, if a UserVF is specified and
- // there are no dependencies, then there's nothing to do.
- if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
- Legal->isSafeForAnyVectorWidth())
- return UserVF;
-
- MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned SmallestType, WidestType;
- std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
- unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-
- // Get the maximum safe dependence distance in bits computed by LAA.
- // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
- // the memory accesses that is most restrictive (involved in the smallest
- // dependence distance).
- unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
-
- // If the user vectorization factor is legally unsafe, clamp it to a safe
- // value. Otherwise, return as is.
- if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
- unsigned MaxSafeElements =
- PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
- ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
- if (UserVF.isScalable()) {
- Optional<unsigned> MaxVScale = TTI.getMaxVScale();
-
- // Scale VF by vscale before checking if it's safe.
- MaxSafeVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
-
- if (MaxSafeVF.isZero()) {
- // The dependence distance is too small to use scalable vectors,
- // fallback on fixed.
- LLVM_DEBUG(
- dbgs()
- << "LV: Max legal vector width too small, scalable vectorization "
- "unfeasible. Using fixed-width vectorization instead.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "Max legal vector width too small, scalable vectorization "
- << "unfeasible. Using fixed-width vectorization instead.";
- });
- return computeFeasibleMaxVF(
- ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
-
- if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
- return UserVF;
-
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe, clamping to max safe VF=" << MaxSafeVF
- << ".\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe, clamping to maximum safe vectorization factor "
- << ore::NV("VectorizationFactor", MaxSafeVF);
- });
- return MaxSafeVF;
- }
-
- WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
+ return FixedScalableVFPair::getNone();
+}
+
+ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
+ unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
+ const ElementCount &MaxSafeVF) {
+ bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
+ TypeSize WidestRegister = TTI.getRegisterBitWidth(
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector);
+
+ // Convenience function to return the minimum of two ElementCounts.
+ auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
+ assert((LHS.isScalable() == RHS.isScalable()) &&
+ "Scalable flags must match");
+ return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
+ };
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
 // Note that both WidestRegister and WidestType may not be powers of 2.
- unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
-
- LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
- << " / " << WidestType << " bits.\n");
+ auto MaxVectorElementCount = ElementCount::get(
+ PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
+ ComputeScalableMaxVF);
+ MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
- << WidestRegister << " bits.\n");
-
- assert(MaxVectorSize <= WidestRegister &&
- "Did not expect to pack so many elements"
- " into one vector!");
- if (MaxVectorSize == 0) {
- LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
- MaxVectorSize = 1;
- return ElementCount::getFixed(MaxVectorSize);
- } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
- isPowerOf2_32(ConstTripCount)) {
+ << (MaxVectorElementCount * WidestType) << " bits.\n");
+
+ if (!MaxVectorElementCount) {
+ LLVM_DEBUG(dbgs() << "LV: The target has no "
+ << (ComputeScalableMaxVF ? "scalable" : "fixed")
+ << " vector registers.\n");
+ return ElementCount::getFixed(1);
+ }
+
+ const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
+ if (ConstTripCount &&
+ ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+ isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below.
+ // choosing a higher viable VF as done in the loop below. If
+ // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
+ // the TC is less than or equal to the known number of lanes.
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
- MaxVectorSize = ConstTripCount;
- return ElementCount::getFixed(MaxVectorSize);
+ return TripCountEC;
}
- unsigned MaxVF = MaxVectorSize;
- if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ ElementCount MaxVF = MaxVectorElementCount;
+ if (TTI.shouldMaximizeVectorBandwidth() ||
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ auto MaxVectorElementCountMaxBW = ElementCount::get(
+ PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
+ ComputeScalableMaxVF);
+ MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
+
// Collect all viable vectorization factors larger than the default MaxVF
- // (i.e. MaxVectorSize).
+ // (i.e. MaxVectorElementCount).
SmallVector<ElementCount, 8> VFs;
- unsigned NewMaxVectorSize = WidestRegister / SmallestType;
- for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
- VFs.push_back(ElementCount::getFixed(VS));
+ for (ElementCount VS = MaxVectorElementCount * 2;
+ ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
+ VFs.push_back(VS);
// For each VF calculate its register usage.
auto RUs = calculateRegisterUsage(VFs);
@@ -5759,59 +5929,97 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
// ones.
for (int i = RUs.size() - 1; i >= 0; --i) {
bool Selected = true;
- for (auto& pair : RUs[i].MaxLocalUsers) {
+ for (auto &pair : RUs[i].MaxLocalUsers) {
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
if (pair.second > TargetNumRegisters)
Selected = false;
}
if (Selected) {
- MaxVF = VFs[i].getKnownMinValue();
+ MaxVF = VFs[i];
break;
}
}
- if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
- if (MaxVF < MinVF) {
+ if (ElementCount MinVF =
+ TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
+ if (ElementCount::isKnownLT(MaxVF, MinVF)) {
LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
<< ") with target's minimum: " << MinVF << '\n');
MaxVF = MinVF;
}
}
}
- return ElementCount::getFixed(MaxVF);
+ return MaxVF;
}
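
A compact standalone sketch of the two clamps in this function, under assumed numbers: a 128-bit fixed-width register and a widest type of 32 bits give at most 4 elements, and a known power-of-2 trip count of 2 then clamps the maximized VF down to 2.

// Illustrative only: pick the widest element count that fits the vector
// register, then clamp it to a small power-of-2 constant trip count.
#include <cstdio>

int main() {
  unsigned WidestRegisterBits = 128; // assumed fixed-width register size
  unsigned WidestTypeBits = 32;      // widest loop type
  unsigned ConstTripCount = 2;       // known small power-of-2 trip count

  unsigned MaxVectorElementCount = 1;
  while (MaxVectorElementCount * 2 * WidestTypeBits <= WidestRegisterBits)
    MaxVectorElementCount *= 2; // PowerOf2Floor(WidestRegister / WidestType)

  unsigned MaxVF = MaxVectorElementCount; // 4 for these numbers
  if (ConstTripCount && ConstTripCount <= MaxVF &&
      (ConstTripCount & (ConstTripCount - 1)) == 0)
    MaxVF = ConstTripCount; // no point choosing a higher VF
  std::printf("maximized VF = %u\n", MaxVF); // 2
  return 0;
}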
-VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
- // FIXME: This can be fixed for scalable vectors later, because at this stage
- // the LoopVectorizer will only consider vectorizing a loop with scalable
- // vectors when the loop has a hint to enable vectorization for a given VF.
- assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
+bool LoopVectorizationCostModel::isMoreProfitable(
+ const VectorizationFactor &A, const VectorizationFactor &B) const {
+ InstructionCost CostA = A.Cost;
+ InstructionCost CostB = B.Cost;
+
+ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
+
+ if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
+ MaxTripCount) {
+ // If we are folding the tail and the trip count is a known (possibly small)
+ // constant, the trip count will be rounded up to an integer number of
+ // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
+ // which we compare directly. When not folding the tail, the total cost will
+ // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
+ // approximated with the per-lane cost below instead of using the tripcount
+ // as here.
+ auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
+ auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
+ return RTCostA < RTCostB;
+ }
+
+ // When set to preferred, for now assume vscale may be larger than 1, so
+ // that scalable vectorization is slightly favorable over fixed-width
+ // vectorization.
+ if (Hints->isScalableVectorizationPreferred())
+ if (A.Width.isScalable() && !B.Width.isScalable())
+ return (CostA * B.Width.getKnownMinValue()) <=
+ (CostB * A.Width.getKnownMinValue());
+
+ // To avoid the need for FP division:
+ // (CostA / A.Width) < (CostB / B.Width)
+ // <=> (CostA * B.Width) < (CostB * A.Width)
+ return (CostA * B.Width.getKnownMinValue()) <
+ (CostB * A.Width.getKnownMinValue());
+}
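
A short standalone sketch of the comparison logic, with assumed example costs: per-lane costs are compared by cross-multiplying the widths to avoid floating-point division, and when the tail is folded with a known max trip count, the rounded-up total cost Cost * ceil(TC / VF) is compared instead.

// Illustrative only: the cross-multiplied profitability comparison.
#include <cstdio>

struct VFCandidate {
  unsigned Width; // known-min number of lanes
  unsigned Cost;  // cost of one vector iteration
};

// CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
static bool isMoreProfitable(VFCandidate A, VFCandidate B,
                             unsigned MaxTripCount, bool FoldTailByMasking) {
  if (FoldTailByMasking && MaxTripCount) {
    auto Ceil = [](unsigned X, unsigned Y) { return (X + Y - 1) / Y; };
    return A.Cost * Ceil(MaxTripCount, A.Width) <
           B.Cost * Ceil(MaxTripCount, B.Width);
  }
  return A.Cost * B.Width < B.Cost * A.Width;
}

int main() {
  VFCandidate VF4 = {4, 10}, VF8 = {8, 18}; // assumed example costs
  std::printf("VF=8 more profitable than VF=4: %d\n",
              isMoreProfitable(VF8, VF4, /*MaxTripCount=*/0,
                               /*FoldTailByMasking=*/false)); // prints 1
  return 0;
}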
+VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
+ const ElementCountSet &VFCandidates) {
InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
+ assert(VFCandidates.count(ElementCount::getFixed(1)) &&
+ "Expected Scalar VF to be a candidate");
- unsigned Width = 1;
- const float ScalarCost = *ExpectedCost.getValue();
- float Cost = ScalarCost;
+ const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+ VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && MaxVF.isVector()) {
+ if (ForceVectorization && VFCandidates.size() > 1) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
- Cost = std::numeric_limits<float>::max();
- }
-
- for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
- // Notice that the vector loop needs to be executed less times, so
- // we need to divide the cost of the vector loops by the width of
- // the vector elements.
- VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
- assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
- float VectorCost = *C.first.getValue() / (float)i;
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
+ ChosenFactor.Cost = InstructionCost::getMax();
+ }
+
+ SmallVector<InstructionVFPair> InvalidCosts;
+ for (const auto &i : VFCandidates) {
+ // The cost for scalar VF=1 is already calculated, so ignore it.
+ if (i.isScalar())
+ continue;
+
+ VectorizationCostTy C = expectedCost(i, &InvalidCosts);
+ VectorizationFactor Candidate(i, C.first);
+ LLVM_DEBUG(
+ dbgs() << "LV: Vector loop of width " << i << " costs: "
+ << (Candidate.Cost / Candidate.Width.getKnownMinValue())
+ << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
+ << ".\n");
+
if (!C.second && !ForceVectorization) {
LLVM_DEBUG(
dbgs() << "LV: Not considering vector loop of width " << i
@@ -5820,32 +6028,86 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
}
// If profitable add it to ProfitableVF list.
- if (VectorCost < ScalarCost) {
- ProfitableVFs.push_back(VectorizationFactor(
- {ElementCount::getFixed(i), (unsigned)VectorCost}));
- }
-
- if (VectorCost < Cost) {
- Cost = VectorCost;
- Width = i;
- }
+ if (isMoreProfitable(Candidate, ScalarCost))
+ ProfitableVFs.push_back(Candidate);
+
+ if (isMoreProfitable(Candidate, ChosenFactor))
+ ChosenFactor = Candidate;
+ }
+
+ // Emit a report of VFs with invalid costs in the loop.
+ if (!InvalidCosts.empty()) {
+ // Group the remarks per instruction, keeping the instruction order from
+ // InvalidCosts.
+ std::map<Instruction *, unsigned> Numbering;
+ unsigned I = 0;
+ for (auto &Pair : InvalidCosts)
+ if (!Numbering.count(Pair.first))
+ Numbering[Pair.first] = I++;
+
+ // Sort the list, first on instruction(number) then on VF.
+ llvm::sort(InvalidCosts,
+ [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+ if (Numbering[A.first] != Numbering[B.first])
+ return Numbering[A.first] < Numbering[B.first];
+ ElementCountComparator ECC;
+ return ECC(A.second, B.second);
+ });
+
+ // For a list of ordered instruction-vf pairs:
+ // [(load, vf1), (load, vf2), (store, vf1)]
+ // Group the instructions together to emit separate remarks for:
+ // load (vf1, vf2)
+ // store (vf1)
+ auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
+ auto Subset = ArrayRef<InstructionVFPair>();
+ do {
+ if (Subset.empty())
+ Subset = Tail.take_front(1);
+
+ Instruction *I = Subset.front().first;
+
+ // If the next instruction is different, or if there are no other pairs,
+ // emit a remark for the collated subset. e.g.
+ // [(load, vf1), (load, vf2)]
+ // to emit:
+ // remark: invalid costs for 'load' at VF=(vf1, vf2)
+ if (Subset == Tail || Tail[Subset.size()].first != I) {
+ std::string OutString;
+ raw_string_ostream OS(OutString);
+ assert(!Subset.empty() && "Unexpected empty range");
+ OS << "Instruction with invalid costs prevented vectorization at VF=(";
+ for (auto &Pair : Subset)
+ OS << (Pair.second == Subset.front().second ? "" : ", ")
+ << Pair.second;
+ OS << "):";
+ if (auto *CI = dyn_cast<CallInst>(I))
+ OS << " call to " << CI->getCalledFunction()->getName();
+ else
+ OS << " " << I->getOpcodeName();
+ OS.flush();
+ reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+ Tail = Tail.drop_front(Subset.size());
+ Subset = {};
+ } else
+ // Grow the subset by one element
+ Subset = Tail.take_front(Subset.size() + 1);
+ } while (!Tail.empty());
}
if (!EnableCondStoresVectorization && NumPredStores) {
reportVectorizationFailure("There are conditional stores.",
"store that is conditionally executed prevents vectorization",
"ConditionalStore", ORE, TheLoop);
- Width = 1;
- Cost = ScalarCost;
+ ChosenFactor = ScalarCost;
}
- LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
+ ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
- LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- VectorizationFactor Factor = {ElementCount::getFixed(Width),
- (unsigned)(Width * Cost)};
- return Factor;
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
+ return ChosenFactor;
}
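A rough standalone sketch, with made-up instruction names, of the remark-grouping loop used above: after sorting, consecutive (instruction, VF) pairs that share an instruction are collated into a single remark listing all of that instruction's offending VFs.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Already sorted by instruction, then by VF, as in selectVectorizationFactor.
  std::vector<std::pair<std::string, std::string>> InvalidCosts = {
      {"load", "vf1"}, {"load", "vf2"}, {"store", "vf1"}};

  size_t Start = 0;
  while (Start < InvalidCosts.size()) {
    size_t End = Start + 1;
    while (End < InvalidCosts.size() &&
           InvalidCosts[End].first == InvalidCosts[Start].first)
      ++End;
    // Emit one remark per instruction, collating all of its VFs.
    std::cout << "invalid costs for '" << InvalidCosts[Start].first
              << "' at VF=(";
    for (size_t I = Start; I < End; ++I)
      std::cout << (I == Start ? "" : ", ") << InvalidCosts[I].second;
    std::cout << ")\n";
    Start = End;
  }
  return 0;
}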
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
@@ -5880,6 +6142,12 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
}))
return false;
+ // Epilogue vectorization code has not been audited to ensure it handles
+ // non-latch exits properly. It may be fine, but it needs to be audited and
+ // tested.
+ if (L.getExitingBlock() != L.getLoopLatch())
+ return false;
+
return true;
}
@@ -5958,7 +6226,8 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
for (auto &NextVF : ProfitableVFs)
if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
- (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+ (Result.Width.getFixedValue() == 1 ||
+ isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
Result = NextVF;
@@ -5973,7 +6242,17 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+ for (Type *T : ElementTypesInLoop) {
+ MinWidth = std::min<unsigned>(
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth = std::max<unsigned>(
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ }
+ return {MinWidth, MaxWidth};
+}
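For illustration only, with hypothetical element types: a loop touching i8, i32 and double values would yield MinWidth = 8 and MaxWidth = 64, which is the min/max reduction the loop above performs over ElementTypesInLoop.

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  // Scalar bit widths of the element types collected from the loop (assumed).
  std::vector<unsigned> Widths = {8, 32, 64};
  unsigned MinWidth = -1U, MaxWidth = 8;
  for (unsigned W : Widths) {
    MinWidth = std::min(MinWidth, W);
    MaxWidth = std::max(MaxWidth, W);
  }
  std::cout << MinWidth << ' ' << MaxWidth << '\n'; // 8 64
  return 0;
}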
+void LoopVectorizationCostModel::collectElementTypesForWidening() {
+ ElementTypesInLoop.clear();
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the loop.
@@ -5993,8 +6272,8 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
- if (PreferInLoopReductions ||
+ const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
+ if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
TTI.preferInLoopReduction(RdxDesc.getOpcode(),
RdxDesc.getRecurrenceType(),
TargetTransformInfo::ReductionFlags()))
@@ -6019,14 +6298,9 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
!isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
continue;
- MinWidth = std::min(MinWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
- MaxWidth = std::max(MaxWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ ElementTypesInLoop.insert(T);
}
}
-
- return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
@@ -6157,8 +6431,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
- LoopCost = *expectedCost(VF).first.getValue();
+ InstructionCost C = expectedCost(VF).first;
+ assert(C.isValid() && "Expected to have chosen a VF with valid cost");
+ LoopCost = *C.getValue();
}
assert(LoopCost && "Non-zero loop cost expected");
@@ -6198,9 +6473,21 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we have a scalar reduction (vector reductions are already dealt with
// by this point), we can increase the critical path length if the loop
- // we're interleaving is inside another loop. Limit, by default to 2, so the
- // critical path only gets increased by one reduction operation.
+ // we're interleaving is inside another loop. For tree-wise reductions
+ // set the limit to 2, and for ordered reductions it's best to disable
+ // interleaving entirely.
if (HasReductions && TheLoop->getLoopDepth() > 1) {
+ bool HasOrderedReductions =
+ any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RdxDesc.isOrdered();
+ });
+ if (HasOrderedReductions) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
+ return 1;
+ }
+
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -6319,10 +6606,14 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// A lambda that gets the register usage for the given type and VF.
const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
- return 0U;
- return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
+ return 0;
+ InstructionCost::CostType RegUsage =
+ *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
+ assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
+ "Nonsensical values for register usage.");
+ return RegUsage;
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -6440,7 +6731,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Only a limited amount of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I) &&
+ "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6469,9 +6761,11 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
for (Instruction &I : *BB)
if (isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
+ // Do not apply discount if scalable, because that would lead to
+ // invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.
- if (!useEmulatedMaskMemRefHack(&I) &&
+ if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
@@ -6548,9 +6842,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
InstructionCost ScalarCost =
- VF.getKnownMinValue() *
+ VF.getFixedValue() *
getInstructionCost(I, ElementCount::getFixed(1)).first;
// Compute the scalarization overhead of needed insertelement instructions
@@ -6558,10 +6851,9 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ APInt::getAllOnesValue(VF.getFixedValue()), true, false);
ScalarCost +=
- VF.getKnownMinValue() *
+ VF.getFixedValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
}
@@ -6576,10 +6868,9 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J, VF)) {
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
+ APInt::getAllOnesValue(VF.getFixedValue()), false, true);
}
}
@@ -6596,7 +6887,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
}
LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(ElementCount VF) {
+LoopVectorizationCostModel::expectedCost(
+ ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
VectorizationCostTy Cost;
// For each block.
@@ -6613,9 +6905,14 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) {
VectorizationCostTy C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ if (C.first.isValid() &&
+ ForceTargetInstructionCost.getNumOccurrences() > 0)
C.first = InstructionCost(ForceTargetInstructionCost);
+ // Keep a list of instructions with invalid costs.
+ if (Invalid && !C.first.isValid())
+ Invalid->emplace_back(&I, VF);
+
BlockCost.first += C.first;
BlockCost.second |= C.second;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
@@ -6680,8 +6977,10 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
assert(VF.isVector() &&
"Scalarization cost of instruction implies vectorization.");
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Type *ValTy = getMemInstValueType(I);
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ Type *ValTy = getLoadStoreType(I);
auto SE = PSE.getSE();
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6707,12 +7006,20 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// we might create due to scalarization.
Cost += getScalarizationOverhead(I, VF);
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
+ // If we have a predicated load/store, it will need extra i1 extracts and
+ // conditional branches, but may not be executed for each vector lane. Scale
+ // the cost by the probability of executing the predicated block.
if (isPredicatedInst(I)) {
Cost /= getReciprocalPredBlockProb();
+ // Add the cost of an i1 extract and a branch
+ auto *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
+ Cost += TTI.getScalarizationOverhead(
+ Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
+ /*Insert=*/false, /*Extract=*/true);
+ Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+
if (useEmulatedMaskMemRefHack(I))
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
@@ -6725,7 +7032,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
+ Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6745,7 +7052,8 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ Cost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
return Cost;
}
@@ -6754,7 +7062,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I));
- Type *ValTy = getMemInstValueType(I);
+ Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6780,7 +7088,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
+ Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
const Value *Ptr = getLoadStorePointerOperand(I);
@@ -6794,7 +7102,12 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
+ // TODO: Once we have support for interleaving with scalable vectors
+ // we can calculate the cost properly here.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6802,7 +7115,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
@@ -6825,17 +7137,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// TODO: Add support for reversed masked interleaved access.
assert(!Legal->isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
- Cost += Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ Cost +=
+ Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
}
return Cost;
}
-InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
+Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
- return InstructionCost::getInvalid();
+ return None;
auto *VectorTy = cast<VectorType>(Ty);
// We are looking for a pattern of, and finding the minimal acceptable cost:
@@ -6851,23 +7165,22 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
// it is not, we return an invalid cost specifying that the original cost method
// should be used.
Instruction *RetI = I;
- if ((RetI->getOpcode() == Instruction::SExt ||
- RetI->getOpcode() == Instruction::ZExt)) {
+ if (match(RetI, m_ZExtOrSExt(m_Value()))) {
if (!RetI->hasOneUser())
- return InstructionCost::getInvalid();
+ return None;
RetI = RetI->user_back();
}
- if (RetI->getOpcode() == Instruction::Mul &&
+ if (match(RetI, m_Mul(m_Value(), m_Value())) &&
RetI->user_back()->getOpcode() == Instruction::Add) {
if (!RetI->hasOneUser())
- return InstructionCost::getInvalid();
+ return None;
RetI = RetI->user_back();
}
// Test if the found instruction is a reduction, and if not return an invalid
// cost specifying the parent to use the original cost modelling.
if (!InLoopReductionImmediateChains.count(RetI))
- return InstructionCost::getInvalid();
+ return None;
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
@@ -6876,10 +7189,17 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
while (!isa<PHINode>(ReductionPhi))
ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
- RecurrenceDescriptor RdxDesc =
+ const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
- unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
- VectorTy, false, CostKind);
+
+ InstructionCost BaseCost = TTI.getArithmeticReductionCost(
+ RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+
+ // If we're using ordered reductions then we can just return the base cost
+ // here, since getArithmeticReductionCost calculates the full ordered
+ // reduction cost when FP reassociation is not allowed.
+ if (useOrderedReductions(RdxDesc))
+ return BaseCost;
// Get the operand that was not the reduction chain and match it to one of the
// patterns, returning the better cost if it is found.
@@ -6889,56 +7209,57 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
- if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
+ Instruction *Op0, *Op1;
+ if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
!TheLoop->isLoopInvariant(RedOp)) {
+ // Matched reduce(ext(A))
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
InstructionCost RedCost = TTI.getExtendedAddReductionCost(
/*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
CostKind);
- unsigned ExtCost =
+ InstructionCost ExtCost =
TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
TTI::CastContextHint::None, CostKind, RedOp);
if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
- return I == RetI ? *RedCost.getValue() : 0;
- } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
- Instruction *Mul = RedOp;
- Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
- Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
- if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
+ return I == RetI ? RedCost : 0;
+ } else if (RedOp &&
+ match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
+ if (match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
!TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
bool IsUnsigned = isa<ZExtInst>(Op0);
auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
- // reduce(mul(ext, ext))
- unsigned ExtCost =
+ // Matched reduce(mul(ext, ext))
+ InstructionCost ExtCost =
TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
TTI::CastContextHint::None, CostKind, Op0);
- unsigned MulCost =
- TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+ InstructionCost MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getExtendedAddReductionCost(
/*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
CostKind);
if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
- return I == RetI ? *RedCost.getValue() : 0;
+ return I == RetI ? RedCost : 0;
} else {
- unsigned MulCost =
- TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+ // Matched reduce(mul())
+ InstructionCost MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
InstructionCost RedCost = TTI.getExtendedAddReductionCost(
/*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
CostKind);
if (RedCost.isValid() && RedCost < MulCost + BaseCost)
- return I == RetI ? *RedCost.getValue() : 0;
+ return I == RetI ? RedCost : 0;
}
}
- return I == RetI ? BaseCost : InstructionCost::getInvalid();
+ return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}
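A small self-contained sketch of the llvm::PatternMatch idioms the function above relies on (m_ZExtOrSExt, m_Mul); the IR is built ad hoc here purely to have something to match and is not part of this patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::PatternMatch;

int main() {
  LLVMContext Ctx;
  Module M("reduce-pattern-demo", Ctx);
  auto *I8 = Type::getInt8Ty(Ctx);
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *FTy = FunctionType::get(I32, {I8, I8}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "mac", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);

  // Build mul(sext(a), sext(b)), the shape that reduce(mul(ext, ext)) has
  // before the reduction itself is applied.
  Value *ExtA = B.CreateSExt(F->getArg(0), I32, "ext.a");
  Value *ExtB = B.CreateSExt(F->getArg(1), I32, "ext.b");
  Value *Mul = B.CreateMul(ExtA, ExtB, "mul");
  B.CreateRet(Mul);

  // The same matchers the cost model uses to recognise the pattern.
  Value *Op0, *Op1;
  if (match(Mul, m_Mul(m_Value(Op0), m_Value(Op1))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      match(Op1, m_ZExtOrSExt(m_Value())))
    outs() << "matched mul(ext, ext)\n";
  return 0;
}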
InstructionCost
@@ -6947,7 +7268,7 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF.isScalar()) {
- Type *ValTy = getMemInstValueType(I);
+ Type *ValTy = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6991,10 +7312,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- ElementCount VF) {
+ ElementCount VF) const {
+
+ // There is no mechanism yet to create a scalable scalarization loop,
+ // so this is currently Invalid.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
- assert(!VF.isScalable() &&
- "cannot compute scalarization overhead for scalable vectorization");
if (VF.isScalar())
return 0;
@@ -7020,8 +7344,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
+ SmallVector<Type *> Tys;
+ for (auto *V : filterExtractingOperands(Ops, VF))
+ Tys.push_back(MaybeVectorizeType(V->getType(), VF));
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
+ filterExtractingOperands(Ops, VF), Tys);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -7047,8 +7374,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- InstructionCost Cost = getUniformMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ InstructionCost Cost;
+ if (isa<StoreInst>(&I) && VF.isScalable() &&
+ isLegalGatherOrScatter(&I)) {
+ Cost = getGatherScatterCost(&I, VF);
+ setWideningDecision(&I, VF, CM_GatherScatter, Cost);
+ } else {
+ assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
+ "Cannot yet scalarize uniform stores");
+ Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ }
continue;
}
@@ -7066,7 +7402,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
// Choose between Interleaving, Gather/Scatter or Scalarization.
- InstructionCost InterleaveCost = std::numeric_limits<int>::max();
+ InstructionCost InterleaveCost = InstructionCost::getInvalid();
unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);
@@ -7084,7 +7420,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
InstructionCost GatherScatterCost =
isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
- : std::numeric_limits<int>::max();
+ : InstructionCost::getInvalid();
InstructionCost ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;
@@ -7181,10 +7517,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto hasSingleCopyAfterVectorization = [this](Instruction *I,
+ ElementCount VF) -> bool {
+ if (VF.isScalar())
+ return true;
+
+ auto Scalarized = InstsToScalarize.find(VF);
+ assert(Scalarized != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return !Scalarized->second.count(I) &&
+ llvm::all_of(I->users(), [&](User *U) {
+ auto *UI = cast<Instruction>(U);
+ return !Scalarized->second.count(UI);
+ });
+ };
+ (void) hasSingleCopyAfterVectorization;
+
+ if (isScalarAfterVectorization(I, VF)) {
+ // With the exception of GEPs and PHIs, after scalarization there should
+ // only be one copy of the instruction generated in the loop. This is
+ // because the VF is either 1, or any instructions that need scalarizing
+ // have already been dealt with by the time we get here. As a result,
+ // we don't have to multiply the instruction cost by VF.
+ assert(I->getOpcode() == Instruction::GetElementPtr ||
+ I->getOpcode() == Instruction::PHI ||
+ (I->getOpcode() == Instruction::BitCast &&
+ I->getType()->isPointerTy()) ||
+ hasSingleCopyAfterVectorization(I, VF));
+ VectorTy = RetTy;
+ } else
+ VectorTy = ToVectorTy(RetTy, VF);
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
@@ -7205,15 +7571,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
+ // Not possible to scalarize scalable vector with predicated instructions.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
// Return cost for branches around scalarized and predicated blocks.
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
- return (TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
- false, true) +
- (TTI.getCFInstrCost(Instruction::Br, CostKind) *
- VF.getKnownMinValue()));
+ return (
+ TTI.getScalarizationOverhead(
+ Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
+ true) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
return TTI.getCFInstrCost(Instruction::Br, CostKind);
@@ -7232,7 +7600,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(
TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
- VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
+ None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -7297,10 +7665,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return 0;
// Detect reduction patterns
- InstructionCost RedCost;
- if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
- .isValid())
- return RedCost;
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ return *RedCost;
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
@@ -7312,26 +7678,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Op2VK = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ return TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
- I->getOperand(0), I);
+ return TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None, I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+
+ const Value *Op0, *Op1;
+ using namespace llvm::PatternMatch;
+ if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
+ match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
+ // select x, y, false --> x & y
+ // select x, true, y --> x | y
+ TTI::OperandValueProperties Op1VP = TTI::OP_None;
+ TTI::OperandValueProperties Op2VP = TTI::OP_None;
+ TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
+ TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
+ assert(Op0->getType()->getScalarSizeInBits() == 1 &&
+ Op1->getType()->getScalarSizeInBits() == 1);
+
+ SmallVector<const Value *, 2> Operands{Op0, Op1};
+ return TTI.getArithmeticInstrCost(
+ match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
+ CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
+ }
+
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
@@ -7358,9 +7738,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
}
- VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ VectorTy = ToVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
}
+ case Instruction::BitCast:
+ if (I->getType()->isPointerTy())
+ return 0;
+ LLVM_FALLTHROUGH;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -7371,8 +7755,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
+ case Instruction::FPTrunc: {
// Computes the CastContextHint from a Load/Store instruction.
auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7424,10 +7807,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
}
// Detect reduction patterns
- InstructionCost RedCost;
- if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
- .isValid())
- return RedCost;
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy =
@@ -7450,10 +7831,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
}
}
- assert(!VF.isScalable() && "VF is assumed to be non scalable");
- unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N *
- TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+ return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7467,12 +7845,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
}
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+ case Instruction::Alloca:
+ // We cannot easily widen alloca to a scalable alloca, as
+ // the result would need to be a vector of pointers.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+ LLVM_FALLTHROUGH;
default:
- // The cost of executing VF copies of the scalar instruction. This opcode
- // is unknown. Assume that it is the same as 'mul'.
- return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
- Instruction::Mul, VectorTy, CostKind) +
- getScalarizationOverhead(I, VF);
+ // This opcode is unknown. Assume that it is the same as 'mul'.
+ return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
} // end of switch.
}
@@ -7548,7 +7929,7 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
// If the target would prefer this reduction to happen "in-loop", then we
// want to record it as such.
unsigned Opcode = RdxDesc.getOpcode();
- if (!PreferInLoopReductions &&
+ if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
!TTI.preferInLoopReduction(Opcode, Phi->getType(),
TargetTransformInfo::ReductionFlags()))
continue;
@@ -7597,8 +7978,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
if (UserVF.isZero()) {
- VF = ElementCount::getFixed(
- determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
+ VF = ElementCount::getFixed(determineVPlanVF(
+ TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize(),
+ CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
// Make sure we have a VF > 1 for stress testing.
@@ -7631,8 +8014,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
- if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
+ FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
+ if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
return None;
// Invalidate interleave groups if all blocks of loop will be predicated.
@@ -7649,34 +8032,35 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
- ElementCount MaxVF = MaybeMaxVF.getValue();
- assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
- bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
- if (!UserVF.isZero() &&
- (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
- // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
- // VFs here, this should be reverted to only use legal UserVFs once the
- // loop below supports scalable VFs.
- ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
- LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
- << " VF " << VF << ".\n");
- assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ ElementCount MaxUserVF =
+ UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
+ bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
+ if (!UserVF.isZero() && UserVFIsLegal) {
+ assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.selectUserVectorizationFactor(VF);
- CM.collectInLoopReductions();
- buildVPlansWithVPRecipes(VF, VF);
- LLVM_DEBUG(printPlans(dbgs()));
- return {{VF, 0}};
+ if (CM.selectUserVectorizationFactor(UserVF)) {
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
+ return {{UserVF, 0}};
+ } else
+ reportVectorizationInfo("UserVF ignored because of invalid costs.",
+ "InvalidCost", ORE, OrigLoop);
}
- assert(!MaxVF.isScalable() &&
- "Scalable vectors not yet supported beyond this point");
+ // Populate the set of Vectorization Factor Candidates.
+ ElementCountSet VFCandidates;
+ for (auto VF = ElementCount::getFixed(1);
+ ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
+ VFCandidates.insert(VF);
+ for (auto VF = ElementCount::getScalable(1);
+ ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
+ VFCandidates.insert(VF);
- for (ElementCount VF = ElementCount::getFixed(1);
- ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
+ for (const auto &VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
@@ -7687,14 +8071,38 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
}
CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
- buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (MaxVF.isScalar())
+ if (!MaxFactors.hasVector())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- return CM.selectVectorizationFactor(MaxVF);
+ auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
+
+ // Check if it is profitable to vectorize with runtime checks.
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+ if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(
+ DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Hints.emitRemarkWithHints();
+ return VectorizationFactor::Disabled();
+ }
+ }
+ return SelectedVF;
}
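A hedged sketch of the final runtime-check gate above, with stand-in threshold values (the real ones come from PragmaVectorizeMemoryCheckThreshold and VectorizerParams::RuntimeMemoryCheckThreshold): vectorization is abandoned when the number of runtime pointer checks exceeds the pragma threshold, or exceeds the default threshold while memory-op reordering is not explicitly allowed.

#include <iostream>

// Stand-in thresholds; the actual values are compiler options, not these.
constexpr unsigned PragmaThreshold = 128;
constexpr unsigned DefaultThreshold = 8;

// Returns true if the chosen VF should be dropped because the loop would
// need too many runtime memory checks.
bool tooManyRuntimeChecks(unsigned WidthKnownMin, unsigned NumChecks,
                          bool AllowReordering) {
  if (WidthKnownMin <= 1 || NumChecks == 0)
    return false;
  bool PragmaThresholdReached = NumChecks > PragmaThreshold;
  bool ThresholdReached = NumChecks > DefaultThreshold;
  return (ThresholdReached && !AllowReordering) || PragmaThresholdReached;
}

int main() {
  std::cout << tooManyRuntimeChecks(4, 10, /*AllowReordering=*/false) << '\n'; // 1
  std::cout << tooManyRuntimeChecks(4, 10, /*AllowReordering=*/true) << '\n';  // 0
  return 0;
}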
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
@@ -7714,19 +8122,11 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- VPCallbackILV CallbackILV(ILV);
-
assert(BestVF.hasValue() && "Vectorization Factor is missing");
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
- VPTransformState State{*BestVF,
- BestUF,
- OrigLoop,
- LI,
- DT,
- ILV.Builder,
- ILV.VectorLoopValueMap,
- &ILV,
- CallbackILV};
+ VPTransformState State{
+ *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
@@ -7742,16 +8142,25 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
VPlans.front()->execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
- ILV.fixVectorizedLoop();
+ ILV.fixVectorizedLoop(State);
ILV.printDebugTracesAtEnd();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ if (PrintVPlansInDotFormat)
+ Plan->printDOT(O);
+ else
+ Plan->print(O);
+}
+#endif
+
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
@@ -7822,9 +8231,9 @@ Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
if (Ty->isFloatingPointTy()) {
Constant *C = ConstantFP::get(Ty, (double)StartIdx);
- // Floating point operations had to be 'fast' to enable the unrolling.
- Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
- return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ // Floating-point operations inherit FMF via the builder's flags.
+ Value *MulOp = Builder.CreateFMul(C, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp);
}
Constant *C = ConstantInt::get(Ty, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
@@ -7882,22 +8291,12 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- BasicBlock *SavedPreHeader = LoopVectorPreHeader;
- emitSCEVChecks(Lp, LoopScalarPreHeader);
-
- // If a safety check was generated save it.
- if (SavedPreHeader != LoopVectorPreHeader)
- EPI.SCEVSafetyCheck = SavedPreHeader;
+ EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
// Generate the code that checks at runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- SavedPreHeader = LoopVectorPreHeader;
- emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
- // If a safety check was generated save/overwite it.
- if (SavedPreHeader != LoopVectorPreHeader)
- EPI.MemSafetyCheck = SavedPreHeader;
+ EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
@@ -7958,8 +8357,8 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
// Generate code to check if the loop's trip count is less than VF * UF of the
// main vector loop.
- auto P =
- Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
@@ -7979,7 +8378,11 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
// Update dominator for Bypass & LoopExit.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ // For loops with multiple exits, there's no edge from the middle block
+ // to exit blocks (as the epilogue must run) and thus no need to update
+ // the immediate dominator of the exit blocks.
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
LoopBypassBlocks.push_back(TCCheckBlock);
@@ -8043,7 +8446,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ // If there is an epilogue which must run, there's no edge from the
+ // middle block to exit blocks and thus no need to update the immediate
+ // dominator of the exit blocks.
+ DT->changeImmediateDominator(LoopExitBlock,
+ EPI.EpilogueIterationCountCheck);
// Keep track of bypass blocks, as they feed start values to the induction
// phis in the scalar loop preheader.
@@ -8102,8 +8510,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
// Generate code to check if the loop's trip count is less than VF * UF of the
// vector epilogue loop.
- auto P =
- Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count,
@@ -8122,9 +8530,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
LLVM_DEBUG({
dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
- << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
- << ", Main Loop UF:" << EPI.MainLoopUF
- << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
<< ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
});
}
@@ -8196,8 +8602,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (BI->getSuccessor(0) != Dst)
EdgeMask = Builder.createNot(EdgeMask);
- if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
- EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+ if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+ // The condition is 'SrcMask && EdgeMask', which is equivalent to
+ // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ // The select version does not introduce new UB if SrcMask is false and
+ // EdgeMask is poison. Using 'and' here introduces undefined behavior.
+ VPValue *False = Plan->getOrAddVPValue(
+ ConstantInt::getFalse(BI->getCondition()->getType()));
+ EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
+ }
return EdgeMaskCache[Edge] = EdgeMask;
}
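A minimal IRBuilder sketch (not part of this patch) contrasting the two ways of combining the masks discussed above; the select form is what createEdgeMask now emits, because a false SrcMask then yields false regardless of EdgeMask, whereas 'and' would propagate poison from EdgeMask.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("edge-mask-demo", Ctx);
  auto *I1 = Type::getInt1Ty(Ctx);
  auto *FTy = FunctionType::get(I1, {I1, I1}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "combine", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);

  Value *SrcMask = F->getArg(0);
  Value *EdgeMask = F->getArg(1);

  // 'and i1 SrcMask, EdgeMask' is poison whenever EdgeMask is poison.
  Value *AndMask = B.CreateAnd(SrcMask, EdgeMask, "and.mask");
  // 'select i1 SrcMask, i1 EdgeMask, i1 false' is false when SrcMask is false,
  // even if EdgeMask is poison; this is the form the recipe builder creates.
  Value *SelMask =
      B.CreateSelect(SrcMask, EdgeMask, ConstantInt::getFalse(I1), "sel.mask");
  B.CreateRet(SelMask);
  (void)AndMask;

  M.print(outs(), nullptr);
  return 0;
}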
@@ -8232,7 +8645,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
else {
auto IVRecipe = new VPWidenCanonicalIVRecipe();
Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
- IV = IVRecipe->getVPValue();
+ IV = IVRecipe->getVPSingleValue();
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
@@ -8266,7 +8679,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask;
}
-VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
+VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range,
VPlanPtr &Plan) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
@@ -8293,32 +8708,35 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
if (Legal->isMaskRequired(I))
Mask = createBlockInMask(I->getParent(), Plan);
- VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
+ return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
StoreInst *Store = cast<StoreInst>(I);
- VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
- return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
+ return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
+ Mask);
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
+VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
+ ArrayRef<VPValue *> Operands) const {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
- VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(Phi, Start);
+ assert(II.getStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+ const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
+ return new VPWidenIntOrFpInductionRecipe(
+ Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
}
return nullptr;
}
-VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
- VPlan &Plan) const {
+VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
+ VPlan &Plan) const {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8340,39 +8758,49 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- Start, I);
+ Start, nullptr, I);
}
return nullptr;
}
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
+VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
+ ArrayRef<VPValue *> Operands,
+ VPlanPtr &Plan) {
+ // If all incoming values are equal, the incoming VPValue can be used directly
+ // instead of creating a new VPBlendRecipe.
+ VPValue *FirstIncoming = Operands[0];
+ if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
+ return FirstIncoming == Inc;
+ })) {
+ return Operands[0];
+ }
+
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
-
- SmallVector<VPValue *, 2> Operands;
+ SmallVector<VPValue *, 2> OperandsWithMask;
unsigned NumIncoming = Phi->getNumIncomingValues();
+
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
assert((EdgeMask || NumIncoming == 1) &&
"Multiple predecessors with one having a full mask");
- Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
+ OperandsWithMask.push_back(Operands[In]);
if (EdgeMask)
- Operands.push_back(EdgeMask);
+ OperandsWithMask.push_back(EdgeMask);
}
- return new VPBlendRecipe(Phi, Operands);
+ return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}
-VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
- VPlan &Plan) const {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](ElementCount VF) {
- return CM.isScalarWithPredication(CI, VF);
- },
+ [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
Range);
if (IsPredicated)
@@ -8395,15 +8823,14 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert(IntrinsicCost.isValid() && CallCost.isValid() &&
- "Cannot have invalid costs while widening");
return UseVectorIntrinsic || !NeedToScalarize;
};
if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
- return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -8413,14 +8840,14 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) ||
- CM.isScalarWithPredication(I, VF);
+ CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
};
return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
Range);
}
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
+ ArrayRef<VPValue *> Operands) const {
auto IsVectorizableOpcode = [](unsigned Opcode) {
switch (Opcode) {
case Instruction::Add:
@@ -8466,20 +8893,28 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
return nullptr;
// Success: widen this instruction.
- return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
+ return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
+}
+
+void VPRecipeBuilder::fixHeaderPhis() {
+ BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
+ for (VPWidenPHIRecipe *R : PhisToFix) {
+ auto *PN = cast<PHINode>(R->getUnderlyingValue());
+ VPRecipeBase *IncR =
+ getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
+ R->addOperand(IncR->getVPSingleValue());
+ }
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
VPlanPtr &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
- Range);
+ [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
@@ -8489,10 +8924,16 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
// a vector value, as that happens iff all users use the vector value.
- for (auto &Op : I->operands())
- if (auto *PredInst = dyn_cast<Instruction>(Op))
- if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
- PredInst2Recipe[PredInst]->setAlsoPack(false);
+ for (VPValue *Op : Recipe->operands()) {
+ auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
+ if (!PredR)
+ continue;
+ auto *RepR =
+ cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
+ assert(RepR->isPredicated() &&
+ "expected Replicate recipe to be predicated");
+ RepR->setAlsoPack(false);
+ }
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
@@ -8504,7 +8945,6 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
assert(VPBB->getSuccessors().empty() &&
"VPBB has successors when handling predicated replication.");
// Record predicated instructions for above packing optimizations.
- PredInst2Recipe[I] = Recipe;
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
@@ -8529,6 +8969,10 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
auto *PHIRecipe = Instr->getType()->isVoidTy()
? nullptr
: new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
+ if (PHIRecipe) {
+ Plan->removeVPValueFor(Instr);
+ Plan->addVPValue(Instr, PHIRecipe);
+ }
auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
@@ -8541,53 +8985,75 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
return Region;
}
-VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
- VFRange &Range,
- VPlanPtr &Plan) {
+VPRecipeOrVPValueTy
+VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range, VPlanPtr &Plan) {
// First, check for specific widening recipes that deal with calls, memory
// operations, inductions and Phi nodes.
if (auto *CI = dyn_cast<CallInst>(Instr))
- return tryToWidenCall(CI, Range, *Plan);
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return tryToWidenMemory(Instr, Range, Plan);
+ return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
- return Recipe;
-
- if (Legal->isReductionVariable(Phi)) {
- RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
- VPValue *StartV =
- Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
- return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
+ return tryToBlend(Phi, Operands, Plan);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
+ return toVPRecipeResult(Recipe);
+
+ VPWidenPHIRecipe *PhiRecipe = nullptr;
+ if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
+ VPValue *StartV = Operands[0];
+ if (Legal->isReductionVariable(Phi)) {
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ assert(RdxDesc.getRecurrenceStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc));
+ } else {
+ PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
+ }
+
+ // Record the incoming value from the backedge, so we can add it as an
+ // operand once all recipes have been created.
+ recordRecipeOf(cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ PhisToFix.push_back(PhiRecipe);
+ } else {
+ // TODO: record start and backedge value for remaining pointer induction
+ // phis.
+ assert(Phi->getType()->isPointerTy() &&
+ "only pointer phis should be handled here");
+ PhiRecipe = new VPWidenPHIRecipe(Phi);
}
- return new VPWidenPHIRecipe(Phi);
+ return toVPRecipeResult(PhiRecipe);
}
- if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
- cast<TruncInst>(Instr), Range, *Plan)))
- return Recipe;
+ if (isa<TruncInst>(Instr) &&
+ (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
+ Range, *Plan)))
+ return toVPRecipeResult(Recipe);
if (!shouldWiden(Instr, Range))
return nullptr;
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
- OrigLoop);
+ return toVPRecipeResult(new VPWidenGEPRecipe(
+ GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
bool InvariantCond =
PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
- return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
- InvariantCond);
+ return toVPRecipeResult(new VPWidenSelectRecipe(
+ *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
}
- return tryToWiden(Instr, *Plan);
+ return toVPRecipeResult(tryToWiden(Instr, Operands));
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8610,11 +9076,29 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
auto &ConditionalAssumes = Legal->getConditionalAssumes();
DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
// Dead instructions do not need sinking. Remove them from SinkAfter.
for (Instruction *I : DeadInstructions)
SinkAfter.erase(I);
+ // Cannot sink instructions after dead instructions (there won't be any
+ // recipes for them). Instead, find the first non-dead previous instruction.
+ for (auto &P : Legal->getSinkAfter()) {
+ Instruction *SinkTarget = P.second;
+ Instruction *FirstInst = &*SinkTarget->getParent()->begin();
+ (void)FirstInst;
+ while (DeadInstructions.contains(SinkTarget)) {
+ assert(
+ SinkTarget != FirstInst &&
+ "Must find a live instruction (at least the one feeding the "
+ "first-order recurrence PHI) before reaching beginning of the block");
+ SinkTarget = SinkTarget->getPrevNode();
+ assert(SinkTarget != P.first &&
+ "sink source equals target, no sinking required");
+ }
+ P.second = SinkTarget;
+ }
+
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
VFRange SubRange = {VF, MaxVFPlusOne};
@@ -8626,12 +9110,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const DenseMap<Instruction *, Instruction *> &SinkAfter) {
-
- // Hold a mapping from predicated instructions to their recipes, in order to
- // fix their AlsoPack behavior if a user is determined to replicate and use a
- // scalar instead of vector value.
- DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+ const MapVector<Instruction *, Instruction *> &SinkAfter) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8715,8 +9194,29 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
- if (auto Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ SmallVector<VPValue *, 4> Operands;
+ auto *Phi = dyn_cast<PHINode>(Instr);
+ if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
+ Operands.push_back(Plan->getOrAddVPValue(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
+ } else {
+ auto OpRange = Plan->mapToVPValues(Instr->operands());
+ Operands = {OpRange.begin(), OpRange.end()};
+ }
+ if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
+ Instr, Operands, Range, Plan)) {
+ // If Instr can be simplified to an existing VPValue, use it.
+ if (RecipeOrValue.is<VPValue *>()) {
+ auto *VPV = RecipeOrValue.get<VPValue *>();
+ Plan->addVPValue(Instr, VPV);
+ // If the re-used value is a recipe, register the recipe for the
+ // instruction, in case the recipe for Instr needs to be recorded.
+ if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
+ RecipeBuilder.setRecipe(Instr, R);
+ continue;
+ }
+ // Otherwise, add the new recipe.
+ VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
for (auto *Def : Recipe->definedValues()) {
auto *UV = Def->getUnderlyingValue();
Plan->addVPValue(UV, Def);
@@ -8729,8 +9229,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
- Instr, Range, VPBB, PredInst2Recipe, Plan);
+ VPBasicBlock *NextVPBB =
+ RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
if (NextVPBB != VPBB) {
VPBB = NextVPBB;
VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
@@ -8739,6 +9239,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ RecipeBuilder.fixHeaderPhis();
+
// Discard the empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
// may also be empty, such as the last one, VPBB, reflecting original
// basic blocks with no recipes.
@@ -8754,22 +9256,89 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Apply Sink-After legal constraints.
+ auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
+ auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ if (Region && Region->isReplicator()) {
+ assert(Region->getNumSuccessors() == 1 &&
+ Region->getNumPredecessors() == 1 && "Expected SESE region!");
+ assert(R->getParent()->size() == 1 &&
+ "A recipe in an original replicator region must be the only "
+ "recipe in its block");
+ return Region;
+ }
+ return nullptr;
+ };
for (auto &Entry : SinkAfter) {
VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
- // If the target is in a replication region, make sure to move Sink to the
- // block after it, not into the replication region itself.
- if (auto *Region =
- dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
- if (Region->isReplicator()) {
- assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
+
+ auto *TargetRegion = GetReplicateRegion(Target);
+ auto *SinkRegion = GetReplicateRegion(Sink);
+ if (!SinkRegion) {
+ // If the sink source is not in a replicate region, sink the recipe directly.
+ if (TargetRegion) {
+ // The target is in a replication region; make sure to move Sink to
+ // the block after it, not into the replication region itself.
VPBasicBlock *NextBlock =
- cast<VPBasicBlock>(Region->getSuccessors().front());
+ cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
- continue;
- }
+ } else
+ Sink->moveAfter(Target);
+ continue;
+ }
+
+ // The sink source is in a replicate region. Unhook the region from the CFG.
+ auto *SinkPred = SinkRegion->getSinglePredecessor();
+ auto *SinkSucc = SinkRegion->getSingleSuccessor();
+ VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
+ VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
+ VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
+
+ if (TargetRegion) {
+ // The target recipe is also in a replicate region, move the sink region
+ // after the target region.
+ auto *TargetSucc = TargetRegion->getSingleSuccessor();
+ VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
+ VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
+ VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
+ } else {
+ // The sink source is in a replicate region; we need to move the whole
+ // replicate region, which should only contain a single recipe in the
+ // main block.
+ auto *SplitBlock =
+ Target->getParent()->splitAt(std::next(Target->getIterator()));
+
+ auto *SplitPred = SplitBlock->getSinglePredecessor();
+
+ VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
+ VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
+ VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
+ if (VPBB == SplitPred)
+ VPBB = SplitBlock;
}
- Sink->moveAfter(Target);
+ }
+
+ // Introduce a recipe to combine the incoming and previous values of a
+ // first-order recurrence.
+ for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
+ if (!RecurPhi)
+ continue;
+
+ auto *RecurSplice = cast<VPInstruction>(
+ Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
+ {RecurPhi, RecurPhi->getBackedgeValue()}));
+
+ VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
+ if (auto *Region = GetReplicateRegion(PrevRecipe)) {
+ VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
+ RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
+ } else
+ RecurSplice->moveAfter(PrevRecipe);
+ RecurPhi->replaceAllUsesWith(RecurSplice);
+ // Set the first operand of RecurSplice to RecurPhi again, after replacing
+ // all users.
+ RecurSplice->setOperand(0, RecurPhi);
}
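
For context, a loop with a first-order recurrence (hypothetical example, not from this patch): each iteration uses a value defined in the previous one, which is what the FirstOrderRecurrenceSplice recipe introduced above reconstructs for the widened loop; conceptually, lane 0 of every vector iteration needs the last lane of the previous iteration's vector.

    // Hypothetical C++ example, not LLVM code. Prev carries a value from the
    // previous iteration; after widening, the "previous value" vector is
    // roughly splice(prev_part, cur_part) =
    //   [prev_part[VF-1], cur_part[0], ..., cur_part[VF-2]].
    void firstDifference(int *Out, const int *In, int N) {
      int Prev = 0;                // start value of the recurrence phi
      for (int I = 0; I < N; ++I) {
        Out[I] = In[I] - Prev;     // uses last iteration's value
        Prev = In[I];              // value carried into the next iteration
      }
    }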
// Interleave memory: for each Interleave Group we marked earlier as relevant
@@ -8780,8 +9349,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
- StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
+ if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
+ auto *StoreR =
+ cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
+ StoredValues.push_back(StoreR->getStoredValue());
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask());
@@ -8801,8 +9373,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
// Adjust the recipes for any inloop reductions.
- if (Range.Start.isVector())
- adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
+ adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
@@ -8818,6 +9389,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ VPlanTransforms::sinkScalarOperands(*Plan);
+ VPlanTransforms::mergeReplicateRegions(*Plan);
+
std::string PlanName;
raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;
@@ -8863,8 +9437,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
}
SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanTransforms::VPInstructionsToVPRecipes(
- OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
+ VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
+ Legal->getInductionVars(),
+ DeadInstructions, *PSE.getSE());
return Plan;
}
@@ -8873,12 +9448,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
- VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+ if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
+ continue;
+
// ReductionOperations are ordered top-down from the phi's use to the
// LoopExitValue. We keep track of the previous item (the Chain) to tell
// which of the two operands will remain scalar and which will be reduced.
@@ -8895,7 +9473,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
"Expected to replace a VPWidenSelectSC");
FirstOpId = 1;
} else {
- assert(isa<VPWidenRecipe>(WidenRecipe) &&
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
"Expected to replace a VPWidenSC");
FirstOpId = 0;
}
@@ -8907,12 +9485,12 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
- WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+ WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
- WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
+ WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
WidenRecipe->eraseFromParent();
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
@@ -8929,19 +9507,10 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
}
}
-Value* LoopVectorizationPlanner::VPCallbackILV::
-getOrCreateVectorValues(Value *V, unsigned Part) {
- return ILV.getOrCreateVectorValue(V, Part);
-}
-
-Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
- Value *V, const VPIteration &Instance) {
- return ILV.getOrCreateScalarValue(V, Instance);
-}
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
- O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
getAddr()->printAsOperand(O, SlotTracker);
@@ -8952,8 +9521,9 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i))
- O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
+ O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
}
+#endif
void VPWidenCallRecipe::execute(VPTransformState &State) {
State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
@@ -8978,17 +9548,17 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
- Trunc);
+ getTruncInst(), getVPValue(0),
+ getCastValue(), State);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
- Value *StartV =
- getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
- State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
+ State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
+ State);
}
void VPBlendRecipe::execute(VPTransformState &State) {
- State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ State.ILV->setDebugLocFromInst(Phi, &State.Builder);
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
@@ -9023,7 +9593,7 @@ void VPBlendRecipe::execute(VPTransformState &State) {
}
}
for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+ State.set(this, Entry[Part], Part);
}
void VPInterleaveRecipe::execute(VPTransformState &State) {
@@ -9034,53 +9604,66 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
+ Value *PrevInChain = State.get(getChainOp(), 0);
for (unsigned Part = 0; Part < State.UF; ++Part) {
RecurKind Kind = RdxDesc->getRecurrenceKind();
+ bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, Part);
VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- Kind, VecTy->getElementType());
+ Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
Constant *IdenVec =
ConstantVector::getSplat(VecTy->getElementCount(), Iden);
Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
NewVecOp = Select;
}
- Value *NewRed =
- createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
- Value *PrevInChain = State.get(getChainOp(), Part);
+ Value *NewRed;
Value *NextInChain;
+ if (IsOrdered) {
+ if (State.VF.isVector())
+ NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+ PrevInChain);
+ else
+ NewRed = State.Builder.CreateBinOp(
+ (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
+ PrevInChain, NewVecOp);
+ PrevInChain = NewRed;
+ } else {
+ PrevInChain = State.get(getChainOp(), Part);
+ NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+ }
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
NextInChain =
createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
NewRed, PrevInChain);
- } else {
+ } else if (IsOrdered)
+ NextInChain = NewRed;
+ else {
NextInChain = State.Builder.CreateBinOp(
(Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
PrevInChain);
}
- State.set(this, getUnderlyingInstr(), NextInChain, Part);
+ State.set(this, NextInChain, Part);
}
}
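
For context, the kind of loop the IsOrdered path above serves (hypothetical example, not from this patch): a floating-point sum whose additions may not be reassociated under strict IEEE semantics, so an in-order (ordered) reduction has to accumulate in source order rather than summing vector lanes in a tree.

    // Hypothetical C++ example, not LLVM code. Without fast-math the sum must
    // be evaluated as ((S + A[0]) + A[1]) + ..., which is the order the
    // ordered-reduction path preserves.
    double strictSum(const double *A, int N) {
      double S = 0.0;
      for (int I = 0; I < N; ++I)
        S += A[I];                 // FP adds in strict source order
      return S;
    }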
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
*State.Instance, IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
- if (State.Instance->Lane == 0) {
+ if (State.Instance->Lane.isFirstLane()) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(
VectorType::get(getUnderlyingValue()->getType(), State.VF));
- State.ValueMap.setVectorValue(getUnderlyingInstr(),
- State.Instance->Part, Poison);
+ State.set(this, Poison, State.Instance->Part);
}
- State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
- *State.Instance);
+ State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
}
return;
}
@@ -9093,15 +9676,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
"Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+ VPIteration(Part, Lane), IsPredicated,
+ State);
}
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.");
unsigned Part = State.Instance->Part;
- unsigned Lane = State.Instance->Lane;
+ unsigned Lane = State.Instance->Lane.getKnownLane();
Value *ConditionBit = nullptr;
VPValue *BlockInMask = getMask();
@@ -9130,6 +9714,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.");
+ assert(isa<VPReplicateRecipe>(getOperand(0)) &&
+ "operand must be VPReplicateRecipe");
// By current pack/unpack logic we need to generate only a single phi node: if
// a vector value for the predicated instruction exists at this point it means
@@ -9138,29 +9724,40 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
unsigned Part = State.Instance->Part;
- Instruction *PredInst =
- cast<Instruction>(getOperand(0)->getUnderlyingValue());
- if (State.ValueMap.hasVectorValue(PredInst, Part)) {
- Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ if (State.hasVectorValue(getOperand(0), Part)) {
+ Value *VectorValue = State.get(getOperand(0), Part);
InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
- State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ if (State.hasVectorValue(this, Part))
+ State.reset(this, VPhi, Part);
+ else
+ State.set(this, VPhi, Part);
+ // NOTE: Currently we need to update the value of the operand, so the next
+ // predicated iteration inserts its generated value in the correct vector.
+ State.reset(getOperand(0), VPhi, Part);
} else {
- Type *PredInstType = PredInst->getType();
+ Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
+ PredicatingBB);
Phi->addIncoming(ScalarPredInst, PredicatedBB);
- State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ if (State.hasScalarValue(this, *State.Instance))
+ State.reset(this, Phi, *State.Instance);
+ else
+ State.set(this, Phi, *State.Instance);
+ // NOTE: Currently we need to update the value of the operand, so the next
+ // predicated iteration inserts its generated value in the correct vector.
+ State.reset(getOperand(0), Phi, *State.Instance);
}
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
- State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
- StoredValue ? nullptr : getVPValue(),
- getAddr(), StoredValue, getMask());
+ State.ILV->vectorizeMemoryInstruction(
+ &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
+ StoredValue, getMask());
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -9213,10 +9810,71 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
return CM_ScalarEpilogueAllowed;
}
-void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
- unsigned Part) {
- set(Def, V, Part);
- ILV->setVectorValue(IRDef, Part, V);
+Value *VPTransformState::get(VPValue *Def, unsigned Part) {
+ // If Values have been set for this Def return the one relevant for \p Part.
+ if (hasVectorValue(Def, Part))
+ return Data.PerPartOutput[Def][Part];
+
+ if (!hasScalarValue(Def, {Part, 0})) {
+ Value *IRV = Def->getLiveInIRValue();
+ Value *B = ILV->getBroadcastInstrs(IRV);
+ set(Def, B, Part);
+ return B;
+ }
+
+ Value *ScalarValue = get(Def, {Part, 0});
+ // If we aren't vectorizing, we can just copy the scalar map values over
+ // to the vector map.
+ if (VF.isScalar()) {
+ set(Def, ScalarValue, Part);
+ return ScalarValue;
+ }
+
+ auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
+ bool IsUniform = RepR && RepR->isUniform();
+
+ unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
+ // Check if there is a scalar value for the selected lane.
+ if (!hasScalarValue(Def, {Part, LastLane})) {
+ // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
+ assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
+ "unexpected recipe found to be invariant");
+ IsUniform = true;
+ LastLane = 0;
+ }
+
+ auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
+ // Set the insert point after the last scalarized instruction or after the
+ // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
+ // will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP =
+ isa<PHINode>(LastInst)
+ ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
+ : std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using
+ // insertelement instructions. Since the resulting vectors are stored in
+ // State, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (IsUniform) {
+ VectorValue = ILV->getBroadcastInstrs(ScalarValue);
+ set(Def, VectorValue, Part);
+ } else {
+ // Initialize packing with insertelements to start from poison.
+ assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
+ set(Def, Undef, Part);
+ for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+ ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
+ VectorValue = get(Def, Part);
+ }
+ Builder.restoreIP(OldIP);
+ return VectorValue;
}
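
As a rough sketch of the two paths above, assuming the standard IRBuilder APIs: a uniform value is broadcast from lane 0, while a non-uniform value is packed lane by lane starting from poison. The helper name and shape are illustrative only, not part of the patch.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Illustrative helper, not part of the patch: mirrors the broadcast vs.
    // per-lane insertelement packing done in VPTransformState::get for one
    // unroll part.
    static Value *materializeVector(IRBuilder<> &B, ArrayRef<Value *> Lanes,
                                    bool IsUniform) {
      unsigned VF = Lanes.size();
      if (IsUniform)
        return B.CreateVectorSplat(VF, Lanes[0]); // broadcast lane 0
      Value *Vec =
          PoisonValue::get(FixedVectorType::get(Lanes[0]->getType(), VF));
      for (unsigned Lane = 0; Lane != VF; ++Lane)  // pack one scalar per lane
        Vec = B.CreateInsertElement(Vec, Lanes[Lane], B.getInt32(Lane));
      return Vec;
    }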
// Process the loop in the VPlan-native vectorization path. This path builds
@@ -9228,7 +9886,8 @@ static bool processLoopInVPlanNativePath(
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
+ LoopVectorizationRequirements &Requirements) {
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9246,11 +9905,14 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
+ Requirements, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
+ CM.collectElementTypesForWidening();
+
// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -9263,19 +9925,67 @@ static bool processLoopInVPlanNativePath(
LVP.setBestPlan(VF.Width, 1);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM, BFI, PSI);
- LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
- << L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(LB, DT);
+ {
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ F->getParent()->getDataLayout());
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
+ &CM, BFI, PSI, Checks);
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+ << L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.executePlan(LB, DT);
+ }
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
-
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
+// Emit a remark if there are stores to floats that required a floating point
+// extension. If the vectorized loop was generated with floating point
+// conversions, there will be a performance penalty from the conversion
+// overhead and the change in the vector width.
+static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
+ SmallVector<Instruction *, 4> Worklist;
+ for (BasicBlock *BB : L->getBlocks()) {
+ for (Instruction &Inst : *BB) {
+ if (auto *S = dyn_cast<StoreInst>(&Inst)) {
+ if (S->getValueOperand()->getType()->isFloatTy())
+ Worklist.push_back(S);
+ }
+ }
+ }
+
+ // Traverse upwards from the floating point stores, searching for floating
+ // point conversions.
+ SmallPtrSet<const Instruction *, 4> Visited;
+ SmallPtrSet<const Instruction *, 4> EmittedRemark;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!L->contains(I))
+ continue;
+ if (!Visited.insert(I).second)
+ continue;
+
+ // Emit a remark if the floating point store required a floating
+ // point conversion.
+ // TODO: More work could be done to identify the root cause such as a
+ // constant or a function return type and point the user to it.
+ if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
+ I->getDebugLoc(), L->getHeader())
+ << "floating point conversion changes vector width. "
+ << "Mixed floating point precision requires an up/down "
+ << "cast that will negatively impact performance.";
+ });
+
+ for (Use &Op : I->operands())
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ Worklist.push_back(OpI);
+ }
+}
+
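
For context, a loop of the shape the checkMixedPrecision remark is aimed at (hypothetical example, not from this patch): the float elements are implicitly extended to double because the constant is a double, so the vectorized body works on double-wide vectors and needs a conversion around every element.

    // Hypothetical C++ example, not LLVM code. Writing 1.25 instead of 1.25f
    // forces float -> double -> float conversions (an fpext in the stored
    // value's operand chain), which is what the VectorMixedPrecision remark
    // reports.
    void scaleMixed(float *A, const float *B, int N) {
      for (int I = 0; I < N; ++I)
        A[I] = B[I] * 1.25;        // double constant: fpext + fptrunc
    }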
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
: InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
!EnableLoopInterleaving),
@@ -9305,7 +10015,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
? "enabled"
: "?"))
<< " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
+ << " interleave=" << Hints.getInterleave() << "\n");
// Function containing loop
Function *F = L->getHeader()->getParent();
@@ -9326,7 +10036,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
PredicatedScalarEvolution PSE(*SE, *L);
// Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
@@ -9347,7 +10057,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints);
+ ORE, BFI, PSI, Hints, Requirements);
assert(L->isInnermost() && "Inner loop expected.");
@@ -9393,6 +10103,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
+ if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
+ ORE->emit([&]() {
+ auto *ExactFPMathInst = Requirements.getExactFPInst();
+ return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
+ ExactFPMathInst->getDebugLoc(),
+ ExactFPMathInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
+ "reorder floating-point operations\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
@@ -9409,9 +10134,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
CM.collectValuesToIgnore();
+ CM.collectElementTypesForWidening();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
+ Requirements, ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -9426,19 +10153,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+ IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
}
// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
- if (Requirements.doesNotMeet(F, L, Hints)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
if (VF.Width.isScalar()) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
@@ -9518,82 +10238,94 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
- LVP.setBestPlan(VF.Width, IC);
-
- using namespace ore;
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();
-
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop, then
- // interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
- BFI, PSI);
- LVP.executePlan(Unroller, DT);
-
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
- } else {
- // If we decided that it is *legal* to vectorize the loop, then do it.
-
- // Consider vectorizing the epilogue too if it's profitable.
- VectorizationFactor EpilogueVF =
- CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
- if (EpilogueVF.Width.isVector()) {
-
- // The first pass vectorizes the main loop and creates a scalar epilogue
- // to be vectorized by executing the plan (potentially with a different
- // factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
- EpilogueVF.Width.getKnownMinValue(), 1);
- EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
- &LVL, &CM, BFI, PSI);
-
- LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
- LVP.executePlan(MainILV, DT);
- ++LoopsVectorized;
-
- simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
- formLCSSARecursively(*L, *DT, LI, SE);
-
- // Second pass vectorizes the epilogue and adjusts the control flow
- // edges from the first pass.
- LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
- EPI.MainLoopVF = EPI.EpilogueVF;
- EPI.MainLoopUF = EPI.EpilogueUF;
- EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
- ORE, EPI, &LVL, &CM, BFI, PSI);
- LVP.executePlan(EpilogILV, DT);
- ++LoopsEpilogueVectorized;
-
- if (!MainILV.areSafetyChecksAdded())
- DisableRuntimeUnroll = true;
+ {
+ // Optimistically generate runtime checks. Drop them if they turn out not to
+ // be profitable. Limit the scope of Checks, so the cleanup happens
+ // immediately after vector code generation is done.
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ F->getParent()->getDataLayout());
+ if (!VF.Width.isScalar() || IC > 1)
+ Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
+ LVP.setBestPlan(VF.Width, IC);
+
+ using namespace ore;
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop, then
+ // interleave it.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
+ &CM, BFI, PSI, Checks);
+ LVP.executePlan(Unroller, DT);
+
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM, BFI, PSI);
- LVP.executePlan(LB, DT);
- ++LoopsVectorized;
-
- // Add metadata to disable runtime unrolling a scalar loop when there are
- // no runtime checks about strides and memory. A scalar loop that is
- // rarely used is not worth unrolling.
- if (!LB.areSafetyChecksAdded())
- DisableRuntimeUnroll = true;
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+
+ // Consider vectorizing the epilogue too if it's profitable.
+ VectorizationFactor EpilogueVF =
+ CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+ if (EpilogueVF.Width.isVector()) {
+
+ // The first pass vectorizes the main loop and creates a scalar epilogue
+ // to be vectorized by executing the plan (potentially with a different
+ // factor) again shortly afterwards.
+ EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
+ EpilogueVF.Width.getKnownMinValue(),
+ 1);
+ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
+ EPI, &LVL, &CM, BFI, PSI, Checks);
+
+ LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+ LVP.executePlan(MainILV, DT);
+ ++LoopsVectorized;
+
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ // Second pass vectorizes the epilogue and adjusts the control flow
+ // edges from the first pass.
+ LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
+ EPI.MainLoopVF = EPI.EpilogueVF;
+ EPI.MainLoopUF = EPI.EpilogueUF;
+ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
+ ORE, EPI, &LVL, &CM, BFI, PSI,
+ Checks);
+ LVP.executePlan(EpilogILV, DT);
+ ++LoopsEpilogueVectorized;
+
+ if (!MainILV.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ } else {
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
+ &LVL, &CM, BFI, PSI, Checks);
+ LVP.executePlan(LB, DT);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling of the scalar loop when there
+ // are no runtime checks about strides and memory. A scalar loop that is
+ // rarely used is not worth unrolling.
+ if (!LB.areSafetyChecksAdded())
+ DisableRuntimeUnroll = true;
+ }
+ // Report the vectorization decision.
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
}
- // Report the vectorization decision.
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
- });
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ checkMixedPrecision(L, ORE);
}
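
As a rough sketch, not the generated IR, of the loop structure the epilogue-vectorization branch above aims for, assuming a main VF of 8 and an epilogue VF of 4: a wide main vector loop, a narrower vectorized epilogue for most of the remaining iterations, and a final scalar tail.

    // Hand-written analogue, not LLVM output: main loop + vectorized epilogue
    // + scalar remainder for a simple increment loop.
    void incrementAll(int *A, int N) {
      int I = 0;
      for (; I + 8 <= N; I += 8)   // main vector loop, VF = 8
        for (int L = 0; L < 8; ++L)
          A[I + L] += 1;
      for (; I + 4 <= N; I += 4)   // vectorized epilogue, VF = 4
        for (int L = 0; L < 4; ++L)
          A[I + L] += 1;
      for (; I < N; ++I)           // scalar remainder
        A[I] += 1;
    }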
Optional<MDNode *> RemainderLoopID =
@@ -9719,8 +10451,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
if (!Result.MadeCFGChange)
PA.preserveSet<CFGAnalyses>();
return PA;