Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  947
1 files changed, 590 insertions, 357 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f24ae6b100d5..23bb6f0860c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -87,7 +87,6 @@
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -332,8 +331,8 @@ static cl::opt<bool>
cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
-cl::opt<bool> EnableStrictReductions(
- "enable-strict-reductions", cl::init(false), cl::Hidden,
+static cl::opt<bool> ForceOrderedReductions(
+ "force-ordered-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) "
"FP reductions"));
@@ -545,7 +544,8 @@ public:
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask);
+ VPValue *StoredValue, VPValue *BlockInMask,
+ bool ConsecutiveStride, bool Reverse);
/// Set the debug location in the builder \p Ptr using the debug location in
/// \p V. If \p Ptr is None then it uses the class member's Builder.
@@ -590,12 +590,11 @@ protected:
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
- /// Fix a first-order recurrence. This is the second phase of vectorizing
- /// this phi node.
+ /// Create the exit value of first order recurrences in the middle block and
+ /// update their users.
void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
- /// Fix a reduction cross-iteration phi. This is the second phase of
- /// vectorizing this phi node.
+ /// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
/// Clear NSW/NUW flags from reduction instructions if necessary.
@@ -621,9 +620,9 @@ protected:
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
- virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd);
+ virtual Value *
+ getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
@@ -890,9 +889,9 @@ public:
private:
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd) override;
+ Value *getStepVector(
+ Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
};
@@ -911,10 +910,9 @@ struct EpilogueLoopVectorizationInfo {
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
- EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
- unsigned EUF)
- : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
- EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+ EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
+ ElementCount EVF, unsigned EUF)
+ : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
assert(EUF == 1 &&
"A high UF for the epilogue loop is likely not beneficial.");
}
@@ -1105,11 +1103,10 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
}
/// Return a value for Step multiplied by VF.
-static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
- assert(isa<ConstantInt>(Step) && "Expected an integer step");
- Constant *StepVal = ConstantInt::get(
- Step->getType(),
- cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+ int64_t Step) {
+ assert(Ty->isIntegerTy() && "Expected an integer step");
+ Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
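For illustration, a minimal standalone sketch of the value the reworked createStepForVF materializes (plain C++, hypothetical helper name, with vscale passed in as an assumed runtime parameter; not part of this patch):

#include <cstdint>

// Models createStepForVF(B, Ty, VF, Step): Step * VF.getKnownMinValue(),
// additionally scaled by the runtime vscale when the VF is scalable.
int64_t stepForVF(int64_t Step, unsigned KnownMinVF, bool Scalable,
                  uint64_t VScale) {
  int64_t StepVal = Step * static_cast<int64_t>(KnownMinVF);
  return Scalable ? StepVal * static_cast<int64_t>(VScale) : StepVal;
}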
@@ -1121,6 +1118,13 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
+static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
+ assert(FTy->isFloatingPointTy() && "Expected floating point type!");
+ Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
+ Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
+ return B.CreateUIToFP(RuntimeVF, FTy);
+}
+
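Similarly, a sketch of what getRuntimeVFAsFloat evaluates to: the (possibly vscale-scaled) element count, converted unsigned-to-FP (hypothetical standalone form, assuming double as the FP type):

#include <cstdint>

double runtimeVFAsFloat(unsigned KnownMinVF, bool Scalable, uint64_t VScale) {
  // Runtime VF, scaled by vscale for scalable VFs, then cast to FP,
  // mirroring the CreateUIToFP call above.
  uint64_t RuntimeVF = Scalable ? uint64_t(KnownMinVF) * VScale : KnownMinVF;
  return static_cast<double>(RuntimeVF);
}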
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
@@ -1319,8 +1323,7 @@ public:
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
- return EnableStrictReductions && !Hints->allowReordering() &&
- RdxDesc.isOrdered();
+ return !Hints->allowReordering() && RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with.
@@ -1495,14 +1498,14 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
- return Legal->isConsecutivePtr(Ptr) &&
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
- return Legal->isConsecutivePtr(Ptr) &&
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
TTI.isLegalMaskedLoad(DataType, Alignment);
}
@@ -1539,7 +1542,7 @@ public:
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
// Loads and stores that need some form of masked operation are predicated
// instructions.
@@ -1593,7 +1596,10 @@ public:
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
- bool blockNeedsPredication(BasicBlock *BB) const {
+ /// Returns true if the instructions in this block require predication
+ /// for any reason, e.g. because tail folding now requires a predicate
+ /// or because the block in the original loop was predicated.
+ bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
@@ -1928,7 +1934,7 @@ class GeneratedRTChecks {
/// The value representing the result of the generated memory runtime checks.
/// If it is nullptr, either no memory runtime checks have been generated or
/// they have been used.
- Instruction *MemRuntimeCheckCond = nullptr;
+ Value *MemRuntimeCheckCond = nullptr;
DominatorTree *DT;
LoopInfo *LI;
@@ -1971,7 +1977,7 @@ public:
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
- std::tie(std::ignore, MemRuntimeCheckCond) =
+ MemRuntimeCheckCond =
addRuntimeChecks(MemCheckBlock->getTerminator(), L,
RtPtrChecking.getChecks(), MemCheckExp);
assert(MemRuntimeCheckCond &&
@@ -2030,7 +2036,6 @@ public:
if (MemCheckExp.isInsertedInstruction(&I))
continue;
SE.forgetValue(&I);
- SE.eraseValueFromMap(&I);
I.eraseFromParent();
}
}
@@ -2289,9 +2294,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
Value *SteppedStart =
- getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+ getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2308,12 +2315,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
Type *StepType = Step->getType();
+ Value *RuntimeVF;
if (Step->getType()->isFloatingPointTy())
- StepType = IntegerType::get(StepType->getContext(),
- StepType->getScalarSizeInBits());
- Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, VF);
Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
// Create a vector splat to use in the induction update.
@@ -2388,9 +2394,13 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
if (isa<TruncInst>(EntryVal))
return;
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (Casts.empty())
+ if (!CastDef) {
+ assert(ID.getCastInsts().empty() &&
+ "there are casts for ID, but no CastDef");
return;
+ }
+ assert(!ID.getCastInsts().empty() &&
+ "there is a CastDef, but no casts for ID");
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
@@ -2462,9 +2472,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Value *StartIdx;
+ if (Step->getType()->isFloatingPointTy())
+ StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
+ else
+ StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);
+
Value *EntryPart =
- getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
- ID.getInductionOpcode());
+ getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -2520,7 +2535,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
+ Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
auto *ValVTy = cast<VectorType>(Val->getType());
@@ -2543,12 +2559,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
- // Add on StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(
- VLen, ConstantInt::get(InitVecValSTy, StartIdx));
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
@@ -2561,6 +2576,8 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
Step = Builder.CreateVectorSplat(VLen, Step);
Value *MulOp = Builder.CreateFMul(InitVec, Step);
return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
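As a rough lane-wise model, the updated getStepVector now produces Val[i] + (StartIdx + i) * Step for the integer case (with fadd/fsub in the FP case), where StartIdx is an arbitrary Value rather than a compile-time int. A standalone sketch, not the IR-building code itself:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> stepVector(const std::vector<int64_t> &Val,
                                int64_t StartIdx, int64_t Step) {
  // Lane i gets Val[i] + (StartIdx + i) * Step.
  std::vector<int64_t> Out(Val.size());
  for (std::size_t I = 0; I < Val.size(); ++I)
    Out[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
  return Out;
}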
@@ -2609,8 +2626,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
}
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx0 =
- createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);
if (!IsUniform && VF.isScalable()) {
auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
@@ -2838,12 +2854,25 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
+ MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
+ "masked interleaved groups are not allowed.");
+ assert((!MaskForGaps || !VF.isScalable()) &&
+ "masking gaps for scalable vectors is not yet supported.");
for (unsigned Part = 0; Part < UF; Part++) {
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
for (unsigned i = 0; i < InterleaveFactor; i++) {
- // Interleaved store group doesn't allow a gap, so each index has a member
- assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
+ assert((Group->getMember(i) || MaskForGaps) &&
+ "Fail to get a member from an interleaved store group");
+ Instruction *Member = Group->getMember(i);
+
+ // Skip the gaps in the group.
+ if (!Member) {
+ Value *Undef = PoisonValue::get(SubVT);
+ StoredVecs.push_back(Undef);
+ continue;
+ }
Value *StoredVec = State.get(StoredValues[i], Part);
@@ -2867,16 +2896,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
"interleaved.vec");
Instruction *NewStoreInstr;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
- }
- else
+ if (BlockInMask || MaskForGaps) {
+ Value *GroupMask = MaskForGaps;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
+ GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
+ ShuffledMask, MaskForGaps)
+ : ShuffledMask;
+ }
+ NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
+ Group->getAlign(), GroupMask);
+ } else
NewStoreInstr =
Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
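The combined store mask built above can be modelled as follows: the per-iteration block mask is replicated across the interleave factor and then ANDed with the gap mask so that lanes of missing members stay disabled. A sketch with hypothetical helper names, assuming MaskForGaps has one bit per lane of the wide interleaved vector:

#include <cstddef>
#include <vector>

std::vector<bool> interleavedStoreMask(const std::vector<bool> &BlockInMask,
                                       const std::vector<bool> &MaskForGaps,
                                       unsigned InterleaveFactor) {
  std::vector<bool> GroupMask;
  // Replicate each block-mask lane InterleaveFactor times ...
  for (bool Lane : BlockInMask)
    for (unsigned I = 0; I < InterleaveFactor; ++I)
      GroupMask.push_back(Lane);
  // ... then clear lanes that belong to gaps in the group.
  for (std::size_t I = 0; I < GroupMask.size(); ++I)
    GroupMask[I] = GroupMask[I] && MaskForGaps[I];
  return GroupMask;
}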
@@ -2886,7 +2920,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
void InnerLoopVectorizer::vectorizeMemoryInstruction(
Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask) {
+ VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
+ bool Reverse) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2895,31 +2930,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
assert((!SI || StoredValue) && "No stored value provided for widened store");
assert((!LI || !StoredValue) && "Stored value provided for widened load");
- LoopVectorizationCostModel::InstWidening Decision =
- Cost->getWideningDecision(Instr, VF);
- assert((Decision == LoopVectorizationCostModel::CM_Widen ||
- Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
- Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
- "CM decision is not to widen the memory instruction");
-
Type *ScalarDataTy = getLoadStoreType(Instr);
auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
- bool ConsecutiveStride =
- Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
- bool CreateGatherScatter =
- (Decision == LoopVectorizationCostModel::CM_GatherScatter);
-
- // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
- // gather/scatter. Otherwise Decision should have been to Scalarize.
- assert((ConsecutiveStride || CreateGatherScatter) &&
- "The instruction should be scalarized");
- (void)ConsecutiveStride;
+ bool CreateGatherScatter = !ConsecutiveStride;
VectorParts BlockInMaskParts(UF);
bool isMaskRequired = BlockInMask;
@@ -2953,7 +2968,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {
- Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+ Value *Increment =
+ createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
PartPtr = cast<GetElementPtrInst>(
Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
PartPtr->setIsInBounds(InBounds);
@@ -3172,7 +3188,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
+ Value *Step = createStepForVF(Builder, Ty, VF, UF);
// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
@@ -3262,8 +3278,7 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {
- Value *Step =
- createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+ Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
}
// Create new preheader for vector loop.
@@ -3433,7 +3448,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
assert(isa<SCEVConstant>(Step) &&
"Expected constant step for pointer induction");
return B.CreateGEP(
- StartValue->getType()->getPointerElementType(), StartValue,
+ ID.getElementType(), StartValue,
CreateMul(Index,
Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
GetInsertPoint())));
@@ -3739,7 +3754,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).
Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
- Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
+ Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Induction =
createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -3857,21 +3872,19 @@ struct CSEDenseMapInfo {
static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = &*I++;
-
- if (!CSEDenseMapInfo::canHandle(In))
+ for (Instruction &In : llvm::make_early_inc_range(*BB)) {
+ if (!CSEDenseMapInfo::canHandle(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
+ if (Instruction *V = CSEMap.lookup(&In)) {
+ In.replaceAllUsesWith(V);
+ In.eraseFromParent();
continue;
}
- CSEMap[In] = In;
+ CSEMap[&In] = &In;
}
}
@@ -3881,7 +3894,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
- for (auto &ArgOp : CI->arg_operands())
+ for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
// Estimate cost of scalarized vector call. The source operands are assumed
@@ -3940,7 +3953,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ SmallVector<const Value *> Arguments(CI->args());
FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
SmallVector<Type *> ParamTys;
std::transform(FTy->param_begin(), FTy->param_end(),
@@ -3974,7 +3987,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from State indicates that it
// wasn't vectorized.
- VPValue *Def = State.Plan->getVPValue(KV.first);
+ // FIXME: Should not rely on getVPValue at this point.
+ VPValue *Def = State.Plan->getVPValue(KV.first, true);
if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -4081,7 +4095,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from State indicates that it
// wasn't vectorized.
- VPValue *Def = State.Plan->getVPValue(KV.first);
+ // FIXME: Should not rely on getVPValue at this point.
+ VPValue *Def = State.Plan->getVPValue(KV.first, true);
if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -4222,17 +4237,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.
- auto *IdxTy = Builder.getInt32Ty();
- auto *VecPhi = cast<PHINode>(State.get(PhiR, 0));
-
- // Fix the latch value of the new recurrence in the vector loop.
- VPValue *PreviousDef = PhiR->getBackedgeValue();
- Value *Incoming = State.get(PreviousDef, UF - 1);
- VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
+ VPValue *PreviousDef = PhiR->getBackedgeValue();
+ Value *Incoming = State.get(PreviousDef, UF - 1);
auto *ExtractForScalar = Incoming;
+ auto *IdxTy = Builder.getInt32Ty();
if (VF.isVector()) {
auto *One = ConstantInt::get(IdxTy, 1);
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4283,8 +4293,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
// and thus no phis which needed updated.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [Phi](Value *V) { return V == Phi; }))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
@@ -4301,29 +4310,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
setDebugLocFromInst(ReductionStartValue);
- VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
+ VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Wrap flags are in general invalid after vectorization, clear them.
clearReductionWrapFlags(RdxDesc, State);
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
-
- unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
- Value *Val = State.get(PhiR->getBackedgeValue(), Part);
- if (PhiR->isOrdered())
- Val = State.get(PhiR->getBackedgeValue(), UF - 1);
-
- cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
- }
-
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
@@ -4361,7 +4354,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RdxDesc.getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi =
- cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
+ cast<PHINode>(State.get(PhiR, Part));
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
}
@@ -4382,13 +4375,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
: Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[Part]->user_begin();
- UI != RdxParts[Part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
+ if (U != Trunc) {
+ U->replaceUsesOfWith(RdxParts[Part], Extnd);
RdxParts[Part] = Extnd;
- } else {
- ++UI;
}
}
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
@@ -4421,9 +4411,11 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- } else {
+ } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+ ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
+ else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
}
}
@@ -4431,7 +4423,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !PhiR->isInLoop()) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (PhiTy != RdxDesc.getRecurrenceType())
@@ -4456,8 +4448,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// fixFirstOrderRecurrence for a more complete explanation of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [LoopExitInst](Value *V) { return V == LoopExitInst; }))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
// Fix the scalar loop reduction variable with the incoming reduction sum
@@ -4488,7 +4479,8 @@ void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &Rd
Instruction *Cur = Worklist.pop_back_val();
if (isa<OverflowingBinaryOperator>(Cur))
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = State.get(State.Plan->getVPValue(Cur), Part);
+ // FIXME: Should not rely on getVPValue at this point.
+ Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
cast<Instruction>(V)->dropPoisonGeneratingFlags();
}
@@ -4519,11 +4511,12 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
// Can be a loop invariant incoming value or the last scalar value to be
// extracted from the vectorized loop.
+ // FIXME: Should not rely on getVPValue at this point.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Value *lastIncomingValue =
OrigLoop->isLoopInvariant(IncomingValue)
? IncomingValue
- : State.get(State.Plan->getVPValue(IncomingValue),
+ : State.get(State.Plan->getVPValue(IncomingValue, true),
VPIteration(UF - 1, Lane));
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
@@ -4763,10 +4756,18 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
}
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PartStart = createStepForVF(
- Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+ Value *PartStart =
+ createStepForVF(Builder, PtrInd->getType(), VF, Part);
if (NeedsVectorIndex) {
+ // Here we cache the whole vector, which means we can support the
+ // extraction of any lane. However, in some cases the extractelement
+ // instruction that is generated for scalar uses of this vector (e.g.
+ // a load instruction) is not folded away. Therefore we still
+ // calculate values for the first n lanes to avoid redundant moves
+ // (when extracting the 0th element) and to produce scalar code (i.e.
+ // additional add/gep instructions instead of expensive extractelement
+ // instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
@@ -4774,9 +4775,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
- // We've cached the whole vector, which means we can support the
- // extraction of any lane.
- continue;
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
@@ -4813,7 +4811,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Value *NumUnrolledElems =
Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
Value *InductionGEP = GetElementPtrInst::Create(
- ScStValueType->getPointerElementType(), NewPointerPhi,
+ II.getElementType(), NewPointerPhi,
Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
InductionLoc);
NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
@@ -4832,7 +4830,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
Value *GEP = Builder.CreateGEP(
- ScStValueType->getPointerElementType(), NewPointerPhi,
+ II.getElementType(), NewPointerPhi,
Builder.CreateMul(
StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
"vector.gep"));
@@ -4979,7 +4977,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
auto *CI = cast<CallInst>(&I);
SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
+ for (Value *ArgOperand : CI->args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
@@ -5128,8 +5126,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Instruction *Update = cast<Instruction>(
cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
- ScalarPtrs.insert(Update);
- return;
+
+ // If there is more than one user of Update (Ptr), we shouldn't assume it
+ // will be scalar after vectorisation as other users of the instruction
+ // may require widening. Otherwise, add it to ScalarPtrs.
+ if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
+ ScalarPtrs.insert(Update);
+ return;
+ }
}
// We only care about bitcast and getelementptr instructions contained in
// the loop.
@@ -5142,12 +5146,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Worklist.count(I))
return;
- // If all users of the pointer will be memory accesses and scalar, place the
- // pointer in ScalarPtrs. Otherwise, place the pointer in
- // PossibleNonScalarPtrs.
- if (llvm::all_of(I->users(), [&](User *U) {
- return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
- isScalarUse(cast<Instruction>(U), Ptr);
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
}))
ScalarPtrs.insert(I);
else
@@ -5254,7 +5257,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
}
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
- if (!blockNeedsPredication(I->getParent()))
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
@@ -5297,12 +5300,20 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
- // needs predication, or it was decided to use masking to deal with gaps.
+ // needs predication, or it was decided to use masking to deal with gaps
+ // (either a gap at the end of a load-access that may result in a speculative
+ // load, or any gaps in a store-access).
bool PredicatedAccessRequiresMasking =
- Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ blockNeedsPredicationForAnyReason(I->getParent()) &&
+ Legal->isMaskRequired(I);
+ bool LoadAccessWithGapsRequiresEpilogMasking =
+ isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
+ !isScalarEpilogueAllowed();
+ bool StoreAccessWithGapsRequiresMasking =
+ isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
+ if (!PredicatedAccessRequiresMasking &&
+ !LoadAccessWithGapsRequiresEpilogMasking &&
+ !StoreAccessWithGapsRequiresMasking)
return true;
// If masked interleaving is required, we expect that the user/target had
@@ -5311,6 +5322,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
assert(useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled.");
+ if (Group->isReverse())
+ return false;
+
auto *Ty = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
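For reference, a store-side interleave group with a gap corresponds to a source loop like the following (an illustrative example, not taken from the patch), where only two of three interleaved fields are written; with this change such a group can be widened by masking off the missing member's lanes instead of being rejected:

struct S { int A, B, C; };

void FillTwoOfThree(struct S *P, int N, int X, int Y) {
  for (int I = 0; I < N; ++I) {
    P[I].A = X; // member 0 of the interleave group
    P[I].C = Y; // member 2; member 1 (B) is the gap
  }
}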
@@ -5320,14 +5334,13 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) {
// Get and ensure we have a valid memory instruction.
- LoadInst *LI = dyn_cast<LoadInst>(I);
- StoreInst *SI = dyn_cast<StoreInst>(I);
- assert((LI || SI) && "Invalid memory instruction");
+ assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
auto *Ptr = getLoadStorePointerOperand(I);
+ auto *ScalarTy = getLoadStoreType(I);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
@@ -5338,7 +5351,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -5369,12 +5381,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
return (!I || !TheLoop->contains(I));
};
+ // Worklist containing uniform instructions demanding lane 0.
SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
- // Instructions that are scalar with predication must not be considered
- // uniform after vectorization, because that would create an erroneous
- // replicating region where only a single instance out of VF should be formed.
+ // Add uniform instructions demanding lane 0 to the worklist. Instructions
+ // that are scalar with predication must not be considered uniform after
+ // vectorization, because that would create an erroneous replicating region
+ // where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
@@ -5433,6 +5447,30 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sideeffect:
+ case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ if (TheLoop->hasLoopInvariantOperands(&I))
+ addToWorklistIfAllowed(&I);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // ExtractValue instructions must be uniform, because the operands are
+ // known to be loop-invariant.
+ if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
+ assert(isOutOfScope(EVI->getAggregateOperand()) &&
+ "Expected aggregate value to be loop invariant");
+ addToWorklistIfAllowed(EVI);
+ continue;
+ }
+
// If there's no pointer operand, there's nothing to do.
auto *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
@@ -5565,13 +5603,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
- if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
- reportVectorizationInfo(
- "Disabling scalable vectorization, because target does not "
- "support scalable vectors.",
- "ScalableVectorsUnsupported", ORE, TheLoop);
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
return ElementCount::getScalable(0);
- }
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
@@ -5579,6 +5612,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return ElementCount::getScalable(0);
}
+ LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
+
auto MaxScalableVF = ElementCount::getScalable(
std::numeric_limits<ElementCount::ScalarTy>::max());
@@ -5614,6 +5649,13 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
// Limit MaxScalableVF by the maximum safe dependence distance.
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs()
+ .second;
+ if (VScaleMax > 0)
+ MaxVScale = VScaleMax;
+ }
MaxScalableVF = ElementCount::getScalable(
MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
if (!MaxScalableVF)
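A small worked sketch of the clamp applied above (hypothetical helper and numbers): with MaxSafeElements = 64 and a vscale_range maximum of 16, the widest safe scalable VF has a known-minimum width of 4, i.e. vscale x 4.

unsigned maxScalableMinVF(unsigned MaxSafeElements, unsigned MaxVScale) {
  // 0 means no scalable VF is considered safe.
  return MaxVScale ? MaxSafeElements / MaxVScale : 0;
}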
@@ -5681,17 +5723,32 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
return MaxSafeFixedVF;
}
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe. Ignoring scalable UserVF.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe. Ignoring the hint to let the compiler pick a "
- "suitable VF.";
- });
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is ignored because scalable vectors are not "
+ "available.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is ignored because the target does not support scalable "
+ "vectors. The compiler will pick a more suitable value.";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe. Ignoring scalable UserVF.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe. Ignoring the hint to let the compiler pick a "
+ "more suitable value.";
+ });
+ }
}
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
@@ -5972,19 +6029,27 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return RTCostA < RTCostB;
}
- // When set to preferred, for now assume vscale may be larger than 1, so
- // that scalable vectorization is slightly favorable over fixed-width
- // vectorization.
+ // Improve estimate for the vector width if it is scalable.
+ unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+ unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+ if (A.Width.isScalable())
+ EstimatedWidthA *= VScale.getValue();
+ if (B.Width.isScalable())
+ EstimatedWidthB *= VScale.getValue();
+ }
+
+ // When set to preferred, for now assume vscale may be larger than 1 (or the
+ // one being tuned for), so that scalable vectorization is slightly favorable
+ // over fixed-width vectorization.
if (Hints->isScalableVectorizationPreferred())
if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getKnownMinValue()) <=
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
// <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * B.Width.getKnownMinValue()) <
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
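The division-free comparison above boils down to the following sketch, where the estimated widths are the known-minimum widths multiplied by the tuning vscale for scalable candidates (hypothetical helper name and example numbers):

#include <cstdint>

// (CostA / WidthA) < (CostB / WidthB)  <=>  CostA * WidthB < CostB * WidthA.
// E.g. CostA = 8 at estimated width 4 vs. CostB = 20 at estimated width 8:
// 8 * 8 = 64 < 20 * 4 = 80, so candidate A has the lower per-lane cost.
bool isMoreProfitableSketch(uint64_t CostA, uint64_t EstimatedWidthA,
                            uint64_t CostB, uint64_t EstimatedWidthB) {
  return CostA * EstimatedWidthB < CostB * EstimatedWidthA;
}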
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6014,11 +6079,22 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
VectorizationCostTy C = expectedCost(i, &InvalidCosts);
VectorizationFactor Candidate(i, C.first);
- LLVM_DEBUG(
- dbgs() << "LV: Vector loop of width " << i << " costs: "
- << (Candidate.Cost / Candidate.Width.getKnownMinValue())
- << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
- << ".\n");
+
+#ifndef NDEBUG
+ unsigned AssumedMinimumVscale = 1;
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+ AssumedMinimumVscale = VScale.getValue();
+ unsigned Width =
+ Candidate.Width.isScalable()
+ ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+ : Candidate.Width.getFixedValue();
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (Candidate.Cost / Width));
+ if (i.isScalable())
+ LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+ << AssumedMinimumVscale << ")");
+ LLVM_DEBUG(dbgs() << ".\n");
+#endif
if (!C.second && !ForceVectorization) {
LLVM_DEBUG(
@@ -6182,15 +6258,6 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- // FIXME: This can be fixed for scalable vectors later, because at this stage
- // the LoopVectorizer will only consider vectorizing a loop with scalable
- // vectors when the loop has a hint to enable vectorization for a given VF.
- if (MainLoopVF.isScalable()) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
- "yet supported.\n");
- return Result;
- }
-
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
@@ -6202,9 +6269,9 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
if (EpilogueVectorizationForceVF > 1) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
- if (LVP.hasPlanWithVFs(
- {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
- return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+ ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
+ if (LVP.hasPlanWithVF(ForcedEC))
+ return {ForcedEC, 0};
else {
LLVM_DEBUG(
dbgs()
@@ -6221,14 +6288,24 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- if (!isEpilogueVectorizationProfitable(MainLoopVF))
+ auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+ if (MainLoopVF.isScalable())
+ LLVM_DEBUG(
+ dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
+ "yet supported. Converting to fixed-width (VF="
+ << FixedMainLoopVF << ") instead\n");
+
+ if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
+ "this loop\n");
return Result;
+ }
for (auto &NextVF : ProfitableVFs)
- if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
(Result.Width.getFixedValue() == 1 ||
isMoreProfitable(NextVF, Result)) &&
- LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+ LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
if (Result != VectorizationFactor::Disabled())
@@ -6471,6 +6548,22 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
unsigned StoresIC = IC / (NumStores ? NumStores : 1);
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+ // There is little point in interleaving for reductions containing selects
+ // and compares when VF=1 since it may just create more overhead than it's
+ // worth for loops with small trip counts. This is because we still have to
+ // do the final reduction after the loop.
+ bool HasSelectCmpReductions =
+ HasReductions &&
+ any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+ RdxDesc.getRecurrenceKind());
+ });
+ if (HasSelectCmpReductions) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
+ return 1;
+ }
+
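A select-cmp reduction of the kind excluded from interleaving here looks roughly like the following loop (illustrative source, not from the patch): the selected value is loop-invariant, and the final result still requires a post-loop reduction regardless of the interleave count.

int AnyEquals(const int *A, int N, int Key) {
  int Found = 0;
  for (int I = 0; I < N; ++I)
    Found = (A[I] == Key) ? 1 : Found; // select-cmp recurrence on Found
  return Found;
}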
// If we have a scalar reduction (vector reductions are already dealt with
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. For tree-wise reductions
@@ -6756,7 +6849,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB))
+ if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
if (isScalarWithPredication(&I)) {
@@ -6851,7 +6944,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnesValue(VF.getFixedValue()), true, false);
+ APInt::getAllOnes(VF.getFixedValue()), true, false);
ScalarCost +=
VF.getFixedValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
@@ -6870,7 +6963,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnesValue(VF.getFixedValue()), false, true);
+ APInt::getAllOnes(VF.getFixedValue()), false, true);
}
}
@@ -7016,7 +7109,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
auto *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
+ Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
@@ -7036,7 +7129,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -7117,18 +7210,16 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
+ // Holds the indices of existing members in the interleaved group.
SmallVector<unsigned, 4> Indices;
- if (isa<LoadInst>(I)) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
+ for (unsigned IF = 0; IF < InterleaveFactor; IF++)
+ if (Group->getMember(IF))
+ Indices.push_back(IF);
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
+ (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -7210,8 +7301,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
Instruction *Op0, *Op1;
- if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
- !TheLoop->isLoopInvariant(RedOp)) {
+ if (RedOp &&
+ match(RedOp,
+ m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+ match(Op0, m_ZExtOrSExt(m_Value())) &&
+ Op0->getOpcode() == Op1->getOpcode() &&
+ Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+ !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
+ (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+
+ // Matched reduce(ext(mul(ext(A), ext(B)))
+ // Note that the extend opcodes need to all match, or if A==B they will have
+ // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
+ // which is equally fine.
+ bool IsUnsigned = isa<ZExtInst>(Op0);
+ auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+ auto *MulType = VectorType::get(Op0->getType(), VectorTy);
+
+ InstructionCost ExtCost =
+ TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
+ TTI::CastContextHint::None, CostKind, Op0);
+ InstructionCost MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
+ InstructionCost Ext2Cost =
+ TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
+ TTI::CastContextHint::None, CostKind, RedOp);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ if (RedCost.isValid() &&
+ RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
+ return I == RetI ? RedCost : 0;
+ } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
+ !TheLoop->isLoopInvariant(RedOp)) {
// Matched reduce(ext(A))
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
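The newly matched reduce(ext(mul(ext(A), ext(B)))) shape above corresponds to a widening multiply-accumulate in the source, roughly like this loop (types chosen purely for illustration):

long long DotWiden(const short *A, const short *B, int N) {
  long long Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += (long long)((int)A[I] * (int)B[I]); // ext(mul(ext(A[i]), ext(B[i])))
  return Sum;
}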
@@ -7245,7 +7369,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
return I == RetI ? RedCost : 0;
- } else {
+ } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
// Matched reduce(mul())
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
@@ -7304,9 +7428,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);
- bool TypeNotScalarized =
- VF.isVector() && VectorTy->isVectorTy() &&
- TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
+ bool TypeNotScalarized = false;
+ if (VF.isVector() && VectorTy->isVectorTy()) {
+ unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+ if (NumParts)
+ TypeNotScalarized = NumParts < VF.getKnownMinValue();
+ else
+ C = InstructionCost::getInvalid();
+ }
return VectorizationCostTy(C, TypeNotScalarized);
}
@@ -7327,8 +7456,8 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
- true, false);
+ cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
+ false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -7340,7 +7469,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
// Collect operands to consider.
CallInst *CI = dyn_cast<CallInst>(I);
- Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+ Instruction::op_range Ops = CI ? CI->args() : I->operands();
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
@@ -7391,8 +7520,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride =
- Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+ int ConsecutiveStride = Legal->isConsecutivePtr(
+ getLoadStoreType(&I), getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7579,8 +7708,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (
TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
- true) +
+ Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
(TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
@@ -7893,7 +8021,7 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(Ptr);
+ return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
return false;
}
@@ -8019,7 +8147,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return None;
// Invalidate interleave groups if all blocks of loop will be predicated.
- if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(*TTI)) {
LLVM_DEBUG(
dbgs()
@@ -8105,28 +8233,30 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return SelectedVF;
}
-void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
- LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
- << '\n');
- BestVF = VF;
- BestUF = UF;
+VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
+ assert(count_if(VPlans,
+ [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
+ 1 &&
+ "Best VF has not a single VPlan.");
- erase_if(VPlans, [VF](const VPlanPtr &Plan) {
- return !Plan->hasVF(VF);
- });
- assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
+ for (const VPlanPtr &Plan : VPlans) {
+ if (Plan->hasVF(VF))
+ return *Plan.get();
+ }
+ llvm_unreachable("No plan found!");
}
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
+ VPlan &BestVPlan,
+ InnerLoopVectorizer &ILV,
DominatorTree *DT) {
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
+ << ", UF=" << BestUF << '\n');
+
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- assert(BestVF.hasValue() && "Vectorization Factor is missing");
- assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-
- VPTransformState State{
- *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
+ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
@@ -8142,7 +8272,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- VPlans.front()->execute(&State);
+ BestVPlan.execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -8222,21 +8352,19 @@ Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
+ Value *Step,
Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *Ty = Val->getType();
assert(!Ty->isVectorTy() && "Val must be a scalar");
if (Ty->isFloatingPointTy()) {
- Constant *C = ConstantFP::get(Ty, (double)StartIdx);
-
// Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = Builder.CreateFMul(C, Step);
+ Value *MulOp = Builder.CreateFMul(StartIdx, Step);
return Builder.CreateBinOp(BinOp, Val, MulOp);
}
- Constant *C = ConstantInt::get(Ty, StartIdx);
- return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+ return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
}
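The signature change above lets the start index be a runtime Value rather than a compile-time int, which matters once per-part offsets are derived from vscale for scalable vectors. Below is a minimal standalone sketch of the scalar form the unroller produces; plain C++ with illustrative names only, not LLVM API.

#include <cstdint>
#include <iostream>

// Integer case: induction = Val + StartIdx * Step.
int64_t scalarStep(int64_t Val, int64_t StartIdx, int64_t Step) {
  return Val + StartIdx * Step;
}

// Floating-point case, assuming the binary op is an fadd:
// induction = Val + StartIdx * Step.
double scalarStepFP(double Val, double StartIdx, double Step) {
  return Val + StartIdx * Step;
}

int main() {
  // With VF=1 and UF=4, part P starts at StartIdx = P. When StartIdx is a
  // runtime quantity (e.g. derived from vscale) it can no longer be baked
  // into a ConstantInt/ConstantFP, hence the Value-typed parameter.
  for (int64_t Part = 0; Part < 4; ++Part)
    std::cout << "part " << Part << ": "
              << scalarStep(/*Val=*/100, /*StartIdx=*/Part, /*Step=*/3) << '\n';
  return 0;
}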
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
@@ -8311,7 +8439,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+
+ IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
+ Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
Induction =
@@ -8329,9 +8459,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
LLVM_DEBUG({
dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
- << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << "Main Loop VF:" << EPI.MainLoopVF
<< ", Main Loop UF:" << EPI.MainLoopUF
- << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF
<< ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
});
}
@@ -8346,8 +8476,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
assert(L && "Expected valid Loop.");
assert(Bypass && "Expected valid bypass basic block.");
- unsigned VFactor =
- ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+ ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
Value *Count = getOrCreateTripCount(L);
// Reuse existing vector loop preheader for TC checks.
@@ -8361,7 +8490,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
- P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+ P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
"min.iters.check");
if (!ForEpilogue)
@@ -8513,11 +8642,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
- Value *CheckMinIters = Builder.CreateICmp(
- P, Count,
- ConstantInt::get(Count->getType(),
- EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
- "min.epilog.iters.check");
+ Value *CheckMinIters =
+ Builder.CreateICmp(P, Count,
+ createStepForVF(Builder, Count->getType(),
+ EPI.EpilogueVF, EPI.EpilogueUF),
+ "min.epilog.iters.check");
ReplaceInstWithInst(
Insert->getTerminator(),
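Both minimum-iteration checks above now compare the trip count against a value built by createStepForVF instead of a fixed VF*UF constant. A minimal standalone sketch of the quantity being compared (plain C++; ElementCountSketch and the other names are illustrative stand-ins, not LLVM API), including the ULE/ULT distinction driven by whether a scalar epilogue is required:

#include <cstdint>

struct ElementCountSketch { // illustrative stand-in for ElementCount
  uint64_t KnownMin;
  bool Scalable;
};

uint64_t stepForVF(ElementCountSketch VF, uint64_t UF, uint64_t VScale) {
  uint64_t Step = VF.KnownMin * UF;
  return VF.Scalable ? Step * VScale : Step;
}

// The bypass to the scalar loop is taken when the trip count is below the
// step (ULT), or not above it when a scalar epilogue is required (ULE).
bool takesScalarPath(uint64_t TripCount, ElementCountSketch VF, uint64_t UF,
                     uint64_t VScale, bool RequiresScalarEpilogue) {
  uint64_t Threshold = stepForVF(VF, UF, VScale);
  return RequiresScalarEpilogue ? TripCount <= Threshold
                                : TripCount < Threshold;
}

int main() {
  ElementCountSketch VF{4, /*Scalable=*/true};
  // 7 iterations, step = 4 * 2 * 1 = 8, so the scalar path is taken.
  return takesScalarPath(/*TripCount=*/7, VF, /*UF=*/2, /*VScale=*/1,
                         /*RequiresScalarEpilogue=*/false) ? 0 : 1;
}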
@@ -8530,7 +8659,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
LLVM_DEBUG({
dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
- << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << "Epilogue Loop VF:" << EPI.EpilogueVF
<< ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
});
}
@@ -8628,7 +8757,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
VPValue *BlockMask = nullptr;
if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredication(BB))
+ if (!CM.blockNeedsPredicationForAnyReason(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
// Create the block in mask as the first non-phi instruction in the block.
@@ -8643,9 +8772,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
- auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ auto *IVRecipe = new VPWidenCanonicalIVRecipe();
Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
- IV = IVRecipe->getVPSingleValue();
+ IV = IVRecipe;
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
@@ -8708,12 +8837,21 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
if (Legal->isMaskRequired(I))
Mask = createBlockInMask(I->getParent(), Plan);
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, Range.Start);
+ bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+ bool Consecutive =
+ Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
+ return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
+ Consecutive, Reverse);
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
- Mask);
+ Mask, Consecutive, Reverse);
}
VPWidenIntOrFpInductionRecipe *
@@ -8829,7 +8967,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}
@@ -8916,6 +9054,37 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
+ // Even if the instruction is not marked as uniform, there are certain
+ // intrinsic calls that can be effectively treated as such, so we check for
+ // them here. Conservatively, we only do this for scalable vectors, since
+ // for fixed-width VFs we can always fall back on full scalarization.
+ if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
+ switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // For scalable vectors if one of the operands is variant then we still
+ // want to mark as uniform, which will generate one instruction for just
+ // the first lane of the vector. We can't scalarize the call in the same
+ // way as for fixed-width vectors because we don't know how many lanes
+ // there are.
+ //
+ // The reasons for doing it this way for scalable vectors are:
+ // 1. For the assume intrinsic, generating the instruction for the first
+ // lane is still better than not generating any at all. For
+ // example, the input may be a splat across all lanes.
+ // 2. For the lifetime start/end intrinsics the pointer operand only
+ // does anything useful when the input comes from a stack object,
+ // which suggests it should always be uniform. For non-stack objects
+ // the effect is to poison the object, which still allows us to
+ // remove the call.
+ IsUniform = true;
+ break;
+ default:
+ break;
+ }
+ }
+
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
setRecipe(I, Recipe);
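The block added above can be condensed into a small predicate. A standalone sketch follows; the IntrinsicID enum and treatAsUniform are illustrative stand-ins, not llvm::Intrinsic IDs or LLVM API.

enum class IntrinsicID { Assume, LifetimeStart, LifetimeEnd, Memcpy, Other };

bool treatAsUniform(bool IsUniform, bool VFIsScalable, IntrinsicID ID) {
  if (IsUniform || !VFIsScalable)
    return IsUniform;
  switch (ID) {
  case IntrinsicID::Assume:
  case IntrinsicID::LifetimeStart:
  case IntrinsicID::LifetimeEnd:
    // Emit the call for the first lane only; full scalarization is not an
    // option when the number of lanes is unknown at compile time.
    return true;
  default:
    return false;
  }
}

int main() {
  return treatAsUniform(false, /*VFIsScalable=*/true, IntrinsicID::Assume) ? 0 : 1;
}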
@@ -9137,6 +9306,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(R);
// For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+ "Only min/max recurrences allowed for inloop reductions");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
}
@@ -9165,22 +9336,27 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
auto Plan = std::make_unique<VPlan>();
- VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- Plan->setEntry(VPBB);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
+ VPBasicBlock *VPBB = nullptr;
+ VPBasicBlock *HeaderVPBB = nullptr;
+ SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ if (VPBB)
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ else {
+ Plan->setEntry(FirstVPBBForBB);
+ HeaderVPBB = FirstVPBBForBB;
+ }
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
@@ -9222,6 +9398,17 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Plan->addVPValue(UV, Def);
}
+ if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
+ HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
+ // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
+ // of the header block. That can happen for truncates of induction
+ // variables. Those recipes are moved to the phi section of the header
+ // block after applying SinkAfter, which relies on the original
+ // position of the trunc.
+ assert(isa<TruncInst>(Instr));
+ InductionsToMove.push_back(
+ cast<VPWidenIntOrFpInductionRecipe>(Recipe));
+ }
RecipeBuilder.setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
continue;
@@ -9239,17 +9426,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ assert(isa<VPBasicBlock>(Plan->getEntry()) &&
+ !Plan->getEntry()->getEntryBasicBlock()->empty() &&
+ "entry block must be set to a non-empty VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
- // may also be empty, such as the last one VPBB, reflecting original
- // basic-blocks with no recipes.
- VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
- assert(PreEntry->empty() && "Expecting empty pre-entry block.");
- VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- VPBlockUtils::disconnectBlocks(PreEntry, Entry);
- delete PreEntry;
-
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
@@ -9318,6 +9499,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ // Now that sink-after is done, move induction recipes for optimized truncates
+ // to the phi section of the header block.
+ for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
+ Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+
+ // Adjust the recipes for any inloop reductions.
+ adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
+
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
@@ -9325,16 +9514,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (!RecurPhi)
continue;
+ VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
+ VPBasicBlock *InsertBlock = PrevRecipe->getParent();
+ auto *Region = GetReplicateRegion(PrevRecipe);
+ if (Region)
+ InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
+ if (Region || PrevRecipe->isPhi())
+ Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
+ else
+ Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
+
auto *RecurSplice = cast<VPInstruction>(
Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
{RecurPhi, RecurPhi->getBackedgeValue()}));
- VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
- if (auto *Region = GetReplicateRegion(PrevRecipe)) {
- VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
- RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
- } else
- RecurSplice->moveAfter(PrevRecipe);
RecurPhi->replaceAllUsesWith(RecurSplice);
// Set the first operand of RecurSplice to RecurPhi again, after replacing
// all users.
@@ -9372,22 +9565,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // Adjust the recipes for any inloop reductions.
- adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
-
- // Finally, if tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
- if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
- Builder.setInsertPoint(VPBB);
- auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : Legal->getReductionVars()) {
- if (CM.isInLoopReduction(Reduction.first))
- continue;
- VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
- VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
- Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
- }
- }
+ // From this point onwards, VPlan-to-VPlan transformations may change the plan
+ // in ways that make accessing values using original IR values incorrect.
+ Plan->disableValue2VPValue();
VPlanTransforms::sinkScalarOperands(*Plan);
VPlanTransforms::mergeReplicateRegions(*Plan);
@@ -9405,6 +9585,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9443,12 +9624,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
return Plan;
}
-// Adjust the recipes for any inloop reductions. The chain of instructions
-// leading from the loop exit instr to the phi need to be converted to
-// reductions, with one operand being vector and the other being the scalar
-// reduction chain.
-void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
- VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
+// Adjust the recipes for reductions. For in-loop reductions the chain of
+// instructions leading from the loop exit instr to the phi needs to be converted
+// to reductions, with one operand being vector and the other being the scalar
+// reduction chain. For other reductions, a select is introduced between the phi
+// and live-out recipes when folding the tail.
+void LoopVectorizationPlanner::adjustRecipesForReductions(
+ VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
+ ElementCount MinVF) {
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
@@ -9468,6 +9651,8 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
VPValue *ChainOp = Plan->getVPValue(Chain);
unsigned FirstOpId;
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+ "Only min/max recurrences allowed for inloop reductions");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
"Expected to replace a VPWidenSelectSC");
@@ -9505,6 +9690,21 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
Chain = R;
}
}
+
+ // If tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
+ if (CM.foldTailByMasking()) {
+ for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || PhiR->isInLoop())
+ continue;
+ Builder.setInsertPoint(LatchVPBB);
+ VPValue *Cond =
+ RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ VPValue *Red = PhiR->getBackedgeValue();
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
+ }
+ }
}
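For the tail-folding path added at the end of adjustRecipesForReductions, a minimal standalone sketch of what the introduced select computes per lane at the latch: masked-off lanes keep the value of the reduction phi. Plain C++ with illustrative names, not LLVM API.

#include <array>
#include <cstddef>

template <size_t VF>
std::array<int, VF> maskedLatchSelect(const std::array<bool, VF> &BlockMask,
                                      const std::array<int, VF> &Red,
                                      const std::array<int, VF> &Phi) {
  std::array<int, VF> Next{};
  for (size_t Lane = 0; Lane < VF; ++Lane)
    Next[Lane] = BlockMask[Lane] ? Red[Lane] : Phi[Lane]; // select(cond, red, phi)
  return Next;
}

int main() {
  std::array<bool, 4> Mask{true, true, false, false};
  std::array<int, 4> Red{1, 2, 3, 4}, Phi{9, 9, 9, 9};
  auto Next = maskedLatchSelect(Mask, Red, Phi); // {1, 2, 9, 9}
  return Next[2] == 9 ? 0 : 1;
}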
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -9519,9 +9719,22 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
O << ", ";
Mask->printAsOperand(O, SlotTracker);
}
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i))
- O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " store ";
+ getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = load from index " << i;
+ }
+ ++OpIdx;
+ }
}
#endif
@@ -9605,17 +9818,20 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
Value *PrevInChain = State.get(getChainOp(), 0);
+ RecurKind Kind = RdxDesc->getRecurrenceKind();
+ bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
+ // Propagate the fast-math flags carried by the underlying instruction.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
+ State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
- RecurKind Kind = RdxDesc->getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, Part);
VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ Value *Iden = RdxDesc->getRecurrenceIdentity(
Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
- Constant *IdenVec =
- ConstantVector::getSplat(VecTy->getElementCount(), Iden);
+ Value *IdenVec =
+ State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
NewVecOp = Select;
}
@@ -9627,8 +9843,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
PrevInChain);
else
NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
- PrevInChain, NewVecOp);
+ (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
+ NewVecOp);
PrevInChain = NewRed;
} else {
PrevInChain = State.get(getChainOp(), Part);
@@ -9640,11 +9856,10 @@ void VPReductionRecipe::execute(VPTransformState &State) {
NewRed, PrevInChain);
} else if (IsOrdered)
NextInChain = NewRed;
- else {
+ else
NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
+ (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
PrevInChain);
- }
State.set(this, NextInChain, Part);
}
}
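The conditional branch of VPReductionRecipe::execute now splats the recurrence identity with CreateVectorSplat before selecting. A standalone sketch of the resulting computation for an integer add reduction (plain C++, illustrative names, not LLVM API): inactive lanes are replaced with the identity, the vector is reduced, and the result is folded into the chain.

#include <array>
#include <cstddef>
#include <numeric>

template <size_t VF>
int predicatedAddReduction(int ChainIn, const std::array<int, VF> &VecOp,
                           const std::array<bool, VF> &Cond) {
  std::array<int, VF> Selected{};
  for (size_t Lane = 0; Lane < VF; ++Lane)
    Selected[Lane] = Cond[Lane] ? VecOp[Lane] : 0; // identity splat for add
  int Reduced = std::accumulate(Selected.begin(), Selected.end(), 0);
  return ChainIn + Reduced; // binop with the previous value in the chain
}

int main() {
  std::array<int, 4> Vec{1, 2, 3, 4};
  std::array<bool, 4> Cond{true, false, true, false};
  return predicatedAddReduction(10, Vec, Cond) == 14 ? 0 : 1; // 10 + 1 + 3
}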
@@ -9757,7 +9972,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
State.ILV->vectorizeMemoryInstruction(
&Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
- StoredValue, getMask());
+ StoredValue, getMask(), Consecutive, Reverse);
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -9923,7 +10138,7 @@ static bool processLoopInVPlanNativePath(
VectorizationFactor::Disabled() == VF)
return false;
- LVP.setBestPlan(VF.Width, 1);
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
{
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
@@ -9932,7 +10147,7 @@ static bool processLoopInVPlanNativePath(
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(LB, DT);
+ LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -10103,7 +10318,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
+ bool AllowOrderedReductions;
+ // If the flag is set, use that instead and override the TTI behaviour.
+ if (ForceOrderedReductions.getNumOccurrences() > 0)
+ AllowOrderedReductions = ForceOrderedReductions;
+ else
+ AllowOrderedReductions = TTI->enableOrderedReductions();
+ if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
ORE->emit([&]() {
auto *ExactFPMathInst = Requirements.getExactFPInst();
return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
@@ -10248,7 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
F->getParent()->getDataLayout());
if (!VF.Width.isScalar() || IC > 1)
Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
- LVP.setBestPlan(VF.Width, IC);
using namespace ore;
if (!VectorizeLoop) {
@@ -10257,7 +10477,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- LVP.executePlan(Unroller, DT);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10276,14 +10498,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
- EpilogueVF.Width.getKnownMinValue(),
- 1);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
- LVP.executePlan(MainILV, DT);
+ VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+ LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
+ DT);
++LoopsVectorized;
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
@@ -10291,13 +10512,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks);
- LVP.executePlan(EpilogILV, DT);
+
+ VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+ LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
+ DT);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10305,7 +10528,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM, BFI, PSI, Checks);
- LVP.executePlan(LB, DT);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10423,15 +10648,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
@@ -10455,3 +10677,14 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserveSet<CFGAnalyses>();
return PA;
}
+
+void LoopVectorizePass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+
+ OS << "<";
+ OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
+ OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
+ OS << ">";
+}
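A small standalone sketch of the option string the new printPipeline override emits for the two booleans (plain C++, not LLVM API); for example, pipelineOptions(false, true) yields "<no-interleave-forced-only;vectorize-forced-only;>".

#include <iostream>
#include <string>

std::string pipelineOptions(bool InterleaveOnlyWhenForced,
                            bool VectorizeOnlyWhenForced) {
  std::string S = "<";
  S += (InterleaveOnlyWhenForced ? "" : "no-");
  S += "interleave-forced-only;";
  S += (VectorizeOnlyWhenForced ? "" : "no-");
  S += "vectorize-forced-only;";
  S += ">";
  return S;
}

int main() { std::cout << pipelineOptions(false, true) << '\n'; }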