path: root/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2149
1 file changed, 1039 insertions, 1110 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3290439ecd07..b637b2d5ddae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,7 +58,6 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
-#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -112,7 +111,6 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
@@ -144,10 +142,10 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
+#include <map>
#include <memory>
#include <string>
#include <tuple>
@@ -346,13 +344,6 @@ cl::opt<bool> EnableVPlanNativePath(
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
-// FIXME: Remove this switch once we have divergence analysis. Currently we
-// assume divergent non-backedge branches when this switch is true.
-cl::opt<bool> EnableVPlanPredication(
- "enable-vplan-predication", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path predicator with "
- "support for outer loop vectorization."));
-
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -481,7 +472,7 @@ public:
VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
- void fixVectorizedLoop(VPTransformState &State);
+ void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -491,12 +482,6 @@ public:
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
- /// Vectorize a single first-order recurrence or pointer induction PHINode in
- /// a block. This method handles the induction variable canonicalization. It
- /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
- VPTransformState &State);
-
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
@@ -506,13 +491,6 @@ public:
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type. \p CanonicalIV is the scalar value generated for
- /// the canonical induction variable.
- void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
- VPTransformState &State, Value *CanonicalIV);
-
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
VPTransformState &State);
@@ -527,13 +505,8 @@ public:
ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
- /// Set the debug location in the builder \p Ptr using the debug location in
- /// \p V. If \p Ptr is None then it uses the class member's Builder.
- void setDebugLocFromInst(const Value *V,
- Optional<IRBuilder<> *> CustomBuilder = None);
-
- /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
- void fixNonInductionPHIs(VPTransformState &State);
+ /// Fix the non-induction PHIs in \p Plan.
+ void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
/// Returns true if the reordering of FP operations is not allowed, but we are
/// able to vectorize with strict in-order reductions for the given RdxDesc.
@@ -546,17 +519,6 @@ public:
/// element.
virtual Value *getBroadcastInstrs(Value *V);
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
-
// Returns the resume value (bc.merge.rdx) for a reduction as
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
@@ -575,13 +537,9 @@ protected:
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock);
-
- /// Introduce a conditional branch (on true, condition to be set later) at the
- /// end of the header=latch connecting it to itself (across the backedge) and
- /// to the exit block of \p L.
- void createHeaderBranch(Loop *L);
+ Value *VectorTripCount, Value *EndValue,
+ BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
+ VPlan &Plan);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
@@ -595,16 +553,9 @@ protected:
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
/// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+ void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
VPTransformState &State);
- /// Fixup the LCSSA phi nodes in the unique exit block. This simply
- /// means we need to add the appropriate incoming value from the middle
- /// block as exiting edges from the scalar epilogue loop (if present) are
- /// already in place, and we exit the vector loop exclusively to the middle
- /// block.
- void fixLCSSAPHIs(VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -613,30 +564,11 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// Compute scalar induction steps. \p ScalarIV is the scalar induction
- /// variable on which to base the steps, \p Step is the size of the step, and
- /// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable - it
- /// can also be a truncate instruction.
- void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
- const InductionDescriptor &ID, VPValue *Def,
- VPTransformState &State);
-
- /// Create a vector induction phi node based on an existing scalar one. \p
- /// EntryVal is the value from the original loop that maps to the vector phi
- /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
- /// truncate instruction, instead of widening the original IV, we widen a
- /// version of the IV truncated to \p EntryVal's type.
- void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
- Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def,
- VPTransformState &State);
-
/// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(Loop *NewLoop);
+ Value *getOrCreateTripCount(BasicBlock *InsertBlock);
/// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(Loop *NewLoop);
+ Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
/// Returns a bitcasted value to the requested vector type.
/// Also handles bitcasts of vector<float> <-> vector<pointer> types.
@@ -645,33 +577,21 @@ protected:
/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
- void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+ void emitIterationCountCheck(BasicBlock *Bypass);
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct. Returns the block containing the checks or
/// nullptr if no checks have been added.
- BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
/// Emit bypass checks to check any memory assumptions we may have made.
/// Returns the block containing the checks or nullptr if no checks have been
/// added.
- BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
- /// Compute the transformed value of Index at offset StartValue using step
- /// StepValue.
- /// For integer induction, returns StartValue + Index * StepValue.
- /// For pointer induction, returns StartValue[Index * StepValue].
- /// FIXME: The newly created binary instructions should contain nsw/nuw
- /// flags, which can be found from the original scalar operations.
- Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
- const DataLayout &DL,
- const InductionDescriptor &ID,
- BasicBlock *VectorHeader) const;
+ BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader. Also
- /// allocate a loop object for the new vector loop and return it.
- Loop *createVectorLoopSkeleton(StringRef Prefix);
+ /// vector loop preheader, middle block and scalar preheader.
+ void createVectorLoopSkeleton(StringRef Prefix);
/// Create new phi nodes for the induction variables to resume iteration count
/// in the scalar epilogue, from where the vectorized loop left off.
@@ -680,21 +600,12 @@ protected:
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
- Loop *L,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
/// conditional branches in the middle block, preparing the builder and
- /// running the verifier. Take in the vector loop \p L as argument, and return
- /// the preheader of the completed vector loop.
- BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
-
- /// Add additional metadata to \p To that was not present on \p Orig.
- ///
- /// Currently this is used to add the noalias annotations based on the
- /// inserted memchecks. Use this for instructions that are *cloned* into the
- /// vector loop.
- void addNewMetadata(Instruction *To, const Instruction *Orig);
+ /// running the verifier. Return the preheader of the completed vector loop.
+ BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
/// Collect poison-generating recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
@@ -741,13 +652,6 @@ protected:
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
- /// LoopVersioning. It's only set up (non-null) if memchecks were
- /// used.
- ///
- /// This is currently only used to add no-alias metadata based on the
- /// memchecks. The actually versioning is performed manually.
- std::unique_ptr<LoopVersioning> LVer;
-
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
ElementCount VF;
@@ -774,9 +678,6 @@ protected:
/// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;
- /// The vector loop body.
- BasicBlock *LoopVectorBody;
-
/// The scalar loop body.
BasicBlock *LoopScalarBody;
@@ -805,10 +706,6 @@ protected:
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
- // Vector of original scalar PHIs whose corresponding widened PHIs need to be
- // fixed up at the end of vector code generation.
- SmallVector<PHINode *, 8> OrigPHIsToFix;
-
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
@@ -936,8 +833,7 @@ protected:
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
- BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
- bool ForEpilogue);
+ BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
@@ -956,7 +852,9 @@ public:
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
GeneratedRTChecks &Checks)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {}
+ EPI, LVL, CM, BFI, PSI, Checks) {
+ TripCount = EPI.TripCount;
+ }
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -966,7 +864,7 @@ protected:
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
/// the vector epilogue or the scalar epilogue.
- BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
+ BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
BasicBlock *Bypass,
BasicBlock *Insert);
void printDebugTracesAtStart() override;
@@ -993,31 +891,6 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-void InnerLoopVectorizer::setDebugLocFromInst(
- const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
- IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
- const DILocation *DIL = Inst->getDebugLoc();
-
- // When a FSDiscriminator is enabled, we don't need to add the multiply
- // factors to the discriminators.
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
- // FIXME: For scalable vectors, assume vscale=1.
- auto NewDIL =
- DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
- if (NewDIL)
- B->SetCurrentDebugLocation(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- } else
- B->SetCurrentDebugLocation(DIL);
- } else
- B->SetCurrentDebugLocation(DebugLoc());
-}
-
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
@@ -1059,7 +932,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
namespace llvm {
/// Return a value for Step multiplied by VF.
-Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
@@ -1067,12 +940,13 @@ Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
}
/// Return the runtime value for VF.
-Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
-static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
+static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
+ ElementCount VF) {
assert(FTy->isFloatingPointTy() && "Expected floating point type!");
Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
@@ -1119,14 +993,6 @@ static std::string getDebugLocString(const Loop *L) {
}
#endif
-void InnerLoopVectorizer::addNewMetadata(Instruction *To,
- const Instruction *Orig) {
- // If the loop was versioned with memchecks, add the corresponding no-alias
- // metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
- LVer->annotateInstWithNoAlias(To, Orig);
-}
-
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
VPTransformState &State) {
@@ -1151,6 +1017,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
+ isa<VPScalarIVStepsRecipe>(CurRec) ||
isa<VPCanonicalIVPHIRecipe>(CurRec))
continue;
@@ -1176,10 +1043,10 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &Recipe : *VPBB) {
if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
- Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
+ Instruction &UnderlyingInstr = WidenRec->getIngredient();
VPDef *AddrDef = WidenRec->getAddr()->getDef();
- if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
- Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
+ if (AddrDef && WidenRec->isConsecutive() &&
+ Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
collectPoisonGeneratingInstrsInBackwardSlice(
cast<VPRecipeBase>(AddrDef));
} else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
@@ -1206,20 +1073,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
}
-void InnerLoopVectorizer::addMetadata(Instruction *To,
- Instruction *From) {
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
- Instruction *From) {
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
- }
-}
-
PHINode *InnerLoopVectorizer::getReductionResumeValue(
const RecurrenceDescriptor &RdxDesc) {
auto It = ReductionResumeValues.find(&RdxDesc);
@@ -1363,7 +1216,7 @@ public:
/// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
- bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
return !Hints->allowReordering() && RdxDesc.isOrdered();
}
@@ -1701,6 +1554,11 @@ public:
private:
unsigned NumPredStores = 0;
+ /// Convenience function that returns the value of vscale_range iff
+ /// vscale_range.min == vscale_range.max or otherwise returns the value
+ /// returned by the corresponding TLI method.
+ Optional<unsigned> getVScaleForTuning() const;
+
/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -1713,15 +1571,10 @@ private:
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
- /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
- /// issue that occurred on one of the buildbots which cannot be reproduced
- /// without having access to the properietary compiler (see comments on
- /// D98509). The issue is currently under investigation and this workaround
- /// will be removed as soon as possible.
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType,
unsigned WidestType,
- const ElementCount &MaxSafeVF,
+ ElementCount MaxSafeVF,
bool FoldTailByMasking);
/// \return the maximum legal scalable VF, based on the safe max number
@@ -2012,7 +1865,7 @@ public:
/// there is no vector code generation, the check blocks are removed
/// completely.
void Create(Loop *L, const LoopAccessInfo &LAI,
- const SCEVUnionPredicate &UnionPred) {
+ const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
BasicBlock *LoopHeader = L->getHeader();
BasicBlock *Preheader = L->getLoopPreheader();
@@ -2035,9 +1888,19 @@ public:
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
- MemRuntimeCheckCond =
- addRuntimeChecks(MemCheckBlock->getTerminator(), L,
- RtPtrChecking.getChecks(), MemCheckExp);
+ auto DiffChecks = RtPtrChecking.getDiffChecks();
+ if (DiffChecks) {
+ MemRuntimeCheckCond = addDiffRuntimeChecks(
+ MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
+ [VF](IRBuilderBase &B, unsigned Bits) {
+ return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ },
+ IC);
+ } else {
+ MemRuntimeCheckCond =
+ addRuntimeChecks(MemCheckBlock->getTerminator(), L,
+ RtPtrChecking.getChecks(), MemCheckExp);
+ }
assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "
"claimed checks are required");
@@ -2109,12 +1972,16 @@ public:
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition.
- BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
BasicBlock *LoopVectorPreHeader,
BasicBlock *LoopExitBlock) {
if (!SCEVCheckCond)
return nullptr;
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
+
+ Value *Cond = SCEVCheckCond;
+ // Mark the check as used, to prevent it from being removed during cleanup.
+ SCEVCheckCond = nullptr;
+ if (auto *C = dyn_cast<ConstantInt>(Cond))
if (C->isZero())
return nullptr;
@@ -2133,18 +2000,15 @@ public:
DT->addNewBlock(SCEVCheckBlock, Pred);
DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
- ReplaceInstWithInst(
- SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
- // Mark the check as used, to prevent it from being removed during cleanup.
- SCEVCheckCond = nullptr;
+ ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
return SCEVCheckBlock;
}
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
/// the branches to branch to the vector preheader or \p Bypass, depending on
/// the generated condition.
- BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
+ BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
BasicBlock *LoopVectorPreHeader) {
// Check if we generated code that checks in runtime if arrays overlap.
if (!MemRuntimeCheckCond)
@@ -2341,7 +2205,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
Instruction::BinaryOps BinOp, ElementCount VF,
- IRBuilder<> &Builder) {
+ IRBuilderBase &Builder) {
assert(VF.isVector() && "only vector VFs are supported");
// Create and check the types.
@@ -2357,9 +2221,8 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
// Create a vector of consecutive numbers from zero to VF.
VectorType *InitVecValVTy = ValVTy;
- Type *InitVecValSTy = STy;
if (STy->isFloatingPointTy()) {
- InitVecValSTy =
+ Type *InitVecValSTy =
IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
}
@@ -2389,199 +2252,12 @@ static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
-void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
- const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
- IRBuilder<> &Builder = State.Builder;
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
-
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = II.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
- Type *StepType = Step->getType();
- Value *RuntimeVF;
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
- else
- RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
- Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(State.VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*LoopVectorBody->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < UF; ++Part) {
- State.set(Def, LastInduction, Part);
-
- if (isa<TruncInst>(EntryVal))
- addMetadata(LastInduction, EntryVal);
-
- LastInduction = cast<Instruction>(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- // Move the last step to the end of the latch block. This ensures consistent
- // placement of all induction updates.
- auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- LastInduction->moveBefore(Br);
- LastInduction->setName("vec.ind.next");
-
- VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
- VecInd->addIncoming(LastInduction, LoopVectorLatch);
-}
-
-void InnerLoopVectorizer::widenIntOrFpInduction(
- PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
- Value *CanonicalIV) {
- Value *Start = Def->getStartValue()->getLiveInIRValue();
- const InductionDescriptor &ID = Def->getInductionDescriptor();
- TruncInst *Trunc = Def->getTruncInst();
- IRBuilder<> &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- assert(!State.VF.isZero() && "VF must be non-zero");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- auto &DL = EntryVal->getModule()->getDataLayout();
-
- // Generate code for the induction step. Note that induction steps are
- // required to be loop-invariant
- auto CreateStepValue = [&](const SCEV *Step) -> Value * {
- assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
- "Induction step should be loop invariant");
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(),
- State.CFG.VectorPreHeader->getTerminator());
- }
- return cast<SCEVUnknown>(Step)->getValue();
- };
-
- // The scalar value to broadcast. This is derived from the canonical
- // induction variable. If a truncation type is given, truncate the canonical
- // induction variable and step. Otherwise, derive these values from the
- // induction descriptor.
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = CanonicalIV;
- Type *NeededType = IV->getType();
- if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
- ScalarIV =
- NeededType->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
- : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
- State.CFG.PrevBB);
- ScalarIV->setName("offset.idx");
- }
- if (Trunc) {
- auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
- Step = Builder.CreateTrunc(Step, TruncType);
- }
- return ScalarIV;
- };
-
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
- Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
- // Now do the actual transformations, and start with creating the step value.
- Value *Step = CreateStepValue(ID.getStep());
- if (State.VF.isScalar()) {
- Value *ScalarIV = CreateScalarIV(Step);
- Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
- Step->getType()->getScalarSizeInBits());
-
- Instruction::BinaryOps IncOp = ID.getInductionOpcode();
- if (IncOp == Instruction::BinaryOpsEnd)
- IncOp = Instruction::Add;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx = ConstantInt::get(ScalarTy, Part);
- Instruction::BinaryOps MulOp = Instruction::Mul;
- if (Step->getType()->isFloatingPointTy()) {
- StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
- MulOp = Instruction::FMul;
- }
-
- Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
- Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
- State.set(Def, EntryPart, Part);
- if (Trunc) {
- assert(!Step->getType()->isFloatingPointTy() &&
- "fp inductions shouldn't be truncated");
- addMetadata(EntryPart, Trunc);
- }
- }
- return;
- }
-
- // Create a new independent vector induction variable, if one is needed.
- if (Def->needsVectorIV())
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-
- if (Def->needsScalarIV()) {
- // Create scalar steps that can be used by instructions we will later
- // scalarize. Note that the addition of the scalar steps will not increase
- // the number of instructions in the loop in the common case prior to
- // InstCombine. We will be trading one vector extract for each scalar step.
- Value *ScalarIV = CreateScalarIV(Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
- }
-}
-
-void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Instruction *EntryVal,
- const InductionDescriptor &ID,
- VPValue *Def,
- VPTransformState &State) {
- IRBuilder<> &Builder = State.Builder;
+/// Compute scalar induction steps. \p ScalarIV is the scalar induction
+/// variable on which to base the steps, \p Step is the size of the step.
+static void buildScalarSteps(Value *ScalarIV, Value *Step,
+ const InductionDescriptor &ID, VPValue *Def,
+ VPTransformState &State) {
+ IRBuilderBase &Builder = State.Builder;
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(State.VF.isVector() && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
@@ -2652,6 +2328,103 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
}
}
+// Generate code for the induction step. Note that induction steps are
+// required to be loop-invariant
+static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
+ Instruction *InsertBefore,
+ Loop *OrigLoop = nullptr) {
+ const DataLayout &DL = SE.getDataLayout();
+ assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
+ "Induction step should be loop invariant");
+ if (auto *E = dyn_cast<SCEVUnknown>(Step))
+ return E->getValue();
+
+ SCEVExpander Exp(SE, DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
+}
+
+/// Compute the transformed value of Index at offset StartValue using step
+/// StepValue.
+/// For integer induction, returns StartValue + Index * StepValue.
+/// For pointer induction, returns StartValue[Index * StepValue].
+/// FIXME: The newly created binary instructions should contain nsw/nuw
+/// flags, which can be found from the original scalar operations.
+static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
+ Value *StartValue, Value *Step,
+ const InductionDescriptor &ID) {
+ assert(Index->getType()->getScalarType() == Step->getType() &&
+ "Index scalar type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+ // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is to use builder and rely
+ // on InstCombine for future simplifications. Here we handle some trivial
+ // cases only.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ // We allow X to be a vector type, in which case Y will potentially be
+ // splatted into a vector with the same element count.
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType()->getScalarType() == Y->getType() &&
+ "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+ if (XVTy && !isa<VectorType>(Y->getType()))
+ Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
+ return B.CreateMul(X, Y);
+ };
+
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for integer inductions yet");
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(Index, Step);
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<Constant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(!isa<VectorType>(Index->getType()) &&
+ "Vector indices not supported for FP inductions yet");
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *MulExp = B.CreateFMul(Step, Index);
+ return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
const VPIteration &Instance,
VPTransformState &State) {
@@ -2734,7 +2507,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- setDebugLocFromInst(AddrPart);
+ State.setDebugLocFromInst(AddrPart);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2760,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
}
- setDebugLocFromInst(Instr);
+ State.setDebugLocFromInst(Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
Value *MaskForGaps = nullptr;
@@ -2915,8 +2688,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (!Instance.isFirstIteration())
return;
- setDebugLocFromInst(Instr);
-
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2933,21 +2704,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
Cloned->dropPoisonGeneratingFlags();
- State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
- Builder.GetInsertPoint());
+ if (Instr->getDebugLoc())
+ State.setDebugLocFromInst(Instr);
+
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
VPValue *Operand = I.value();
- if (State.Plan->isUniformAfterVectorization(Operand))
+ VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
+ if (OperandR && OperandR->isUniform())
InputInstance.Lane = VPLane::getFirstLane();
Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
- addNewMetadata(Cloned, Instr);
+ State.addNewMetadata(Cloned, Instr);
// Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ State.Builder.Insert(Cloned);
State.set(RepRecipe, Cloned, Instance);
@@ -2960,29 +2733,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}
-void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
- BasicBlock *Header = L->getHeader();
- assert(!L->getLoopLatch() && "loop should not have a latch at this point");
-
- IRBuilder<> B(Header->getTerminator());
- Instruction *OldInst =
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- setDebugLocFromInst(OldInst, &B);
-
- // Connect the header to the exit and header blocks and replace the old
- // terminator.
- B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
-
- // Now we have two terminators. Remove the old one from the block.
- Header->getTerminator()->eraseFromParent();
-}
-
-Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
if (TripCount)
return TripCount;
- assert(L && "Create Trip Count for null loop.");
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ assert(InsertBlock);
+ IRBuilder<> Builder(InsertBlock->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
@@ -3006,7 +2762,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
@@ -3014,22 +2770,23 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- L->getLoopPreheader()->getTerminator());
+ InsertBlock->getTerminator());
if (TripCount->getType()->isPointerTy())
TripCount =
CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- L->getLoopPreheader()->getTerminator());
+ InsertBlock->getTerminator());
return TripCount;
}
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+Value *
+InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;
- Value *TC = getOrCreateTripCount(L);
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ Value *TC = getOrCreateTripCount(InsertBlock);
+ IRBuilder<> Builder(InsertBlock->getTerminator());
Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
@@ -3041,6 +2798,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
+ // For scalable vectors the VF is not guaranteed to be a power of 2, but this
+ // is accounted for in emitIterationCountCheck that adds an overflow check.
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
@@ -3103,9 +2862,8 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
- BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(L);
+void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -3120,10 +2878,23 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
: ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
+ Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
- if (!Cost->foldTailByMasking()) {
- Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
+ Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+ if (!Cost->foldTailByMasking())
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+ else if (VF.isScalable()) {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+
+ // Get the maximum unsigned value for the type.
+ Value *MaxUIntTripCount =
+ ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
+ Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
+
+ // Don't execute the vector loop if (UMax - n) < (VF * UF).
+ CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
}
// Create new preheader for vector loop.
LoopVectorPreHeader =
@@ -3148,10 +2919,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
LoopBypassBlocks.push_back(TCCheckBlock);
}
-BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
BasicBlock *const SCEVCheckBlock =
- RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
+ RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
if (!SCEVCheckBlock)
return nullptr;
@@ -3176,14 +2947,13 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
return SCEVCheckBlock;
}
-BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
- BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
// VPlan-native path does not do any analysis for runtime checks currently.
if (EnableVPlanNativePath)
return nullptr;
BasicBlock *const MemCheckBlock =
- RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
+ RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
// Check if we generated code that checks in runtime if arrays overlap. We put
// the checks into a separate block to make the more common case of few
@@ -3197,7 +2967,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
"to vectorize.");
ORE->emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- L->getStartLoc(), L->getHeader())
+ OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
<< "Code-size may be reduced by not forcing "
"vectorization, or by source-code modifications "
"eliminating the need for runtime checks "
@@ -3209,116 +2980,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
AddedSafetyChecks = true;
- // We currently don't use LoopVersioning for the actual loop cloning but we
- // still use it to add the noalias metadata.
- LVer = std::make_unique<LoopVersioning>(
- *Legal->getLAI(),
- Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
- DT, PSE.getSE());
- LVer->prepareNoAliasMetadata();
return MemCheckBlock;
}
-Value *InnerLoopVectorizer::emitTransformedIndex(
- IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
-
- SCEVExpander Exp(*SE, DL, "induction");
- auto Step = ID.getStep();
- auto StartValue = ID.getStartValue();
- assert(Index->getType()->getScalarType() == Step->getType() &&
- "Index scalar type does not match StepValue type");
-
- // Note: the IR at this point is broken. We cannot use SE to create any new
- // SCEV and then expand it, hoping that SCEV's simplification will give us
- // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
- // lead to various SCEV crashes. So all we can do is to use builder and rely
- // on InstCombine for future simplifications. Here we handle some trivial
- // cases only.
- auto CreateAdd = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isZero())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isZero())
- return X;
- return B.CreateAdd(X, Y);
- };
-
- // We allow X to be a vector type, in which case Y will potentially be
- // splatted into a vector with the same element count.
- auto CreateMul = [&B](Value *X, Value *Y) {
- assert(X->getType()->getScalarType() == Y->getType() &&
- "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isOne())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isOne())
- return X;
- VectorType *XVTy = dyn_cast<VectorType>(X->getType());
- if (XVTy && !isa<VectorType>(Y->getType()))
- Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
- return B.CreateMul(X, Y);
- };
-
- // Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=VectorHeader), because
- // the DomTree is not kept up-to-date for additional blocks generated in the
- // vector loop. By using the header as insertion point, we guarantee that the
- // expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B, VectorHeader]() {
- BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
- if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
- return VectorHeader->getTerminator();
- return &*B.GetInsertPoint();
- };
-
- switch (ID.getKind()) {
- case InductionDescriptor::IK_IntInduction: {
- assert(!isa<VectorType>(Index->getType()) &&
- "Vector indices not supported for integer inductions yet");
- assert(Index->getType() == StartValue->getType() &&
- "Index type does not match StartValue type");
- if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
- return B.CreateSub(StartValue, Index);
- auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
- return CreateAdd(StartValue, Offset);
- }
- case InductionDescriptor::IK_PtrInduction: {
- assert(isa<SCEVConstant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(
- ID.getElementType(), StartValue,
- CreateMul(Index,
- Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
- GetInsertPoint())));
- }
- case InductionDescriptor::IK_FpInduction: {
- assert(!isa<VectorType>(Index->getType()) &&
- "Vector indices not supported for FP inductions yet");
- assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
- assert(InductionBinOp &&
- (InductionBinOp->getOpcode() == Instruction::FAdd ||
- InductionBinOp->getOpcode() == Instruction::FSub) &&
- "Original bin op should be defined for FP induction");
-
- Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
- Value *MulExp = B.CreateFMul(StepValue, Index);
- return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
- "induction");
- }
- case InductionDescriptor::IK_NoInduction:
- return nullptr;
- }
- llvm_unreachable("invalid enum");
-}
-
-Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
+void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -3350,43 +3015,24 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
- // We intentionally don't let SplitBlock to update LoopInfo since
- // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
- // LoopVectorBody is explicitly added to the correct place few lines later.
- LoopVectorBody =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
- nullptr, nullptr, Twine(Prefix) + "vector.body");
-
- // Update dominator for loop exit.
+ // Update dominator for loop exit. During skeleton creation, only the vector
+ // pre-header and the middle block are created. The vector loop is entirely
+ // created during VPlan execution.
if (!Cost->requiresScalarEpilogue(VF))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
-
- // Create and register the new vector loop.
- Loop *Lp = LI->AllocateLoop();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- // Insert the new loop into the loop nest and register the new basic blocks
- // before calling any utilities such as SCEV that require valid LoopInfo.
- if (ParentLoop) {
- ParentLoop->addChildLoop(Lp);
- } else {
- LI->addTopLevelLoop(Lp);
- }
- Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
- return Lp;
}
void InnerLoopVectorizer::createInductionResumeValues(
- Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
- Value *VectorTripCount = getOrCreateVectorTripCount(L);
- assert(VectorTripCount && L && "Expected valid arguments");
+ Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
+ assert(VectorTripCount && "Expected valid arguments");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3399,19 +3045,13 @@ void InnerLoopVectorizer::createInductionResumeValues(
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = VectorTripCount;
} else {
- IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(LoopVectorPreHeader->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
@@ -3420,10 +3060,10 @@ void InnerLoopVectorizer::createInductionResumeValues(
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
- Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
- const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
+ Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
@@ -3431,13 +3071,23 @@ void InnerLoopVectorizer::createInductionResumeValues(
B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
StepType, true);
- CRD =
- B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ VTC =
+ B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
EndValueFromAdditionalBypass =
- emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
+ emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
EndValueFromAdditionalBypass->setName("ind.end");
}
}
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal =
+ PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
@@ -3456,13 +3106,10 @@ void InnerLoopVectorizer::createInductionResumeValues(
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
- MDNode *OrigLoopID) {
- assert(L && "Expected valid loop.");
-
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
// The trip counts should be cached by now.
- Value *Count = getOrCreateTripCount(L);
- Value *VectorTripCount = getOrCreateVectorTripCount(L);
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
@@ -3487,14 +3134,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
}
- // Get ready to start creating new instructions into the vectorized body.
- assert(LoopVectorPreHeader == L->getLoopPreheader() &&
- "Inconsistent vector loop preheader");
- Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- LI->verify(*DT);
#endif
return LoopVectorPreHeader;
@@ -3517,7 +3158,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
|/ |
| v
| [ ] \
- | [ ]_| <-- vector loop.
+ | [ ]_| <-- vector loop (created during VPlan execution).
| |
| v
\ -[ ] <--- middle-block.
@@ -3544,34 +3185,32 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// simply happens to be prone to hitting this in practice. In theory, we
// can hit the same issue for any SCEV, or ValueTracking query done during
// mutation. See PR49900.
- getOrCreateTripCount(OrigLoop);
+ getOrCreateTripCount(OrigLoop->getLoopPreheader());
// Create an empty vector loop, and prepare basic blocks for the runtime
// checks.
- Loop *Lp = createVectorLoopSkeleton("");
+ createVectorLoopSkeleton("");
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop. This check also covers the case where the
// backedge-taken count is uint##_max: adding one to it will overflow leading
// to an incorrect trip count of zero. In this (rare) case we will also jump
// to the scalar loop.
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+ emitIterationCountCheck(LoopScalarPreHeader);
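
// Editor's illustrative sketch (not part of this change): why the "count is
// zero" check also covers a backedge-taken count of uint##_max -- adding one
// wraps to zero, so the same branch sends execution to the scalar loop.
// Standalone standard C++, assuming 32-bit counters.
#include <cstdint>
#include <cstdio>
int main() {
  uint32_t BackedgeTakenCount = UINT32_MAX; // loop body runs 2^32 times
  uint32_t TripCount = BackedgeTakenCount + 1; // unsigned wrap: 0
  if (TripCount == 0)
    std::puts("skip the vector loop, fall through to the scalar loop");
  return 0;
}
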
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- emitSCEVChecks(Lp, LoopScalarPreHeader);
+ emitSCEVChecks(LoopScalarPreHeader);
 // Generate the code that checks at runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
- createHeaderBranch(Lp);
+ emitMemRuntimeChecks(LoopScalarPreHeader);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(Lp);
+ createInductionResumeValues();
- return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
+ return {completeLoopSkeleton(OrigLoopID), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3580,8 +3219,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock) {
+ Value *VectorTripCount, Value *EndValue,
+ BasicBlock *MiddleBlock,
+ BasicBlock *VectorHeader, VPlan &Plan) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
@@ -3608,8 +3248,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
for (User *U : OrigPhi->users()) {
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
- const DataLayout &DL =
- OrigLoop->getHeader()->getModule()->getDataLayout();
assert(isa<PHINode>(UI) && "Expected LCSSA form");
IRBuilder<> B(MiddleBlock->getTerminator());
@@ -3619,15 +3257,18 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
Value *CountMinusOne = B.CreateSub(
- CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
Value *CMO =
!II.getStep()->getType()->isIntegerTy()
? B.CreateCast(Instruction::SIToFP, CountMinusOne,
II.getStep()->getType())
: B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
CMO->setName("cast.cmo");
+
+ Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
+ VectorHeader->getTerminator());
Value *Escape =
- emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
+ emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
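
// Editor's illustrative sketch (not part of this change): external users of
// the induction phi itself need the value of the last executed iteration,
// Start + (VectorTripCount - 1) * Step, while users of the post-increment
// update need Start + VectorTripCount * Step. Standalone standard C++ with
// made-up values for Start, Step and VectorTripCount.
#include <cassert>
#include <cstdint>
static int64_t transformedIndex(int64_t Start, int64_t Step, int64_t Index) {
  return Start + Index * Step;
}
int main() {
  const int64_t Start = 5, Step = 3, VectorTripCount = 8;
  int64_t EndValue = transformedIndex(Start, Step, VectorTripCount);   // 29
  int64_t Escape = transformedIndex(Start, Step, VectorTripCount - 1); // 26
  assert(EndValue == 29 && Escape == 26);
  return 0;
}
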
@@ -3640,8 +3281,10 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// In this case, if IV1 has an external use, we need to avoid adding both
// "last value of IV1" and "penultimate value of IV2". So, verify that we
// don't already have an incoming value for the middle block.
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
PHI->addIncoming(I.second, MiddleBlock);
+ Plan.removeLiveOut(PHI);
+ }
}
}
@@ -3920,18 +3563,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
}
}
-void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
+void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
+ VPlan &Plan) {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF.isVector())
truncateToMinimalBitwidths(State);
// Fix widened non-induction PHIs by setting up the PHI operands.
- if (OrigPHIsToFix.size()) {
- assert(EnableVPlanNativePath &&
- "Unexpected non-induction PHIs for fixup in non VPlan-native path");
- fixNonInductionPHIs(State);
- }
+ if (EnableVPlanNativePath)
+ fixNonInductionPHIs(Plan, State);
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
@@ -3942,24 +3583,37 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
- // If we inserted an edge from the middle block to the unique exit block,
- // update uses outside the loop (phis) to account for the newly inserted
- // edge.
- if (!Cost->requiresScalarEpilogue(VF)) {
+ VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
+ if (Cost->requiresScalarEpilogue(VF)) {
+ // No edge from the middle block to the unique exit block has been inserted
+ // and there is nothing to fix from the vector loop; phis should only have
+ // incoming values from the scalar loop.
+ Plan.clearLiveOuts();
+ } else {
+ // If we inserted an edge from the middle block to the unique exit block,
+ // update uses outside the loop (phis) to account for the newly inserted
+ // edge.
+
// Fix-up external users of the induction variables.
for (auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
- IVEndValues[Entry.first], LoopMiddleBlock);
-
- fixLCSSAPHIs(State);
+ getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
+ IVEndValues[Entry.first], LoopMiddleBlock,
+ VectorLoop->getHeader(), Plan);
}
+ // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
+ // in the exit block, so update the builder.
+ State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+ for (auto &KV : Plan.getLiveOuts())
+ KV.second->fixPhi(Plan, State);
+
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
// Remove redundant induction instructions.
- cse(LoopVectorBody);
+ cse(VectorLoop->getHeader());
// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
@@ -3974,9 +3628,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// For scalable vectorization we can't know at compile time how many iterations
// of the loop are handled in one vector iteration, so instead assume a pessimistic
// vscale of '1'.
- setProfileInfoAfterUnrolling(
- LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
- LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
+ setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
+ LI->getLoopFor(LoopScalarBody),
+ VF.getKnownMinValue() * UF);
}
void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
@@ -3986,7 +3640,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
- VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
+ VPBasicBlock *Header =
+ State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
fixReduction(ReductionPhi, State);
@@ -4102,8 +3757,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(
 // and thus no phis which needed updating.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ State.Plan->removeLiveOut(&LCSSAPhi);
+ }
}
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
@@ -4117,14 +3774,14 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- setDebugLocFromInst(ReductionStartValue);
+ State.setDebugLocFromInst(ReductionStartValue);
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(RdxDesc, State);
+ clearReductionWrapFlags(PhiR, State);
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
@@ -4132,9 +3789,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- setDebugLocFromInst(LoopExitInst);
+ State.setDebugLocFromInst(LoopExitInst);
Type *PhiTy = OrigPhi->getType();
+
+ VPBasicBlock *LatchVPBB =
+ PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
+ BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
@@ -4142,17 +3803,20 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
- Value *Sel = nullptr;
+ SelectInst *Sel = nullptr;
for (User *U : VecLoopExitInst->users()) {
if (isa<SelectInst>(U)) {
assert(!Sel && "Reduction exit feeding two selects");
- Sel = U;
+ Sel = cast<SelectInst>(U);
} else
assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
}
assert(Sel && "Reduction exit feeds no select");
State.reset(LoopExitInstDef, Sel, Part);
+ if (isa<FPMathOperator>(Sel))
+ Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
+
// If the target can create a predicated operator for the reduction at no
// extra cost in the loop (for example a predicated vadd), it can be
// cheaper for the select to remain in the loop than be sunk out of it,
@@ -4164,8 +3828,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi =
cast<PHINode>(State.get(PhiR, Part));
- VecRdxPhi->setIncomingValueForBlock(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
+ VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
}
}
}
@@ -4176,8 +3839,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
+ Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
VectorParts RdxParts(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = State.get(LoopExitInstDef, Part);
@@ -4208,7 +3870,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
- setDebugLocFromInst(LoopMiddleBlock->getTerminator());
+ State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
if (PhiR->isOrdered())
ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
else {
@@ -4265,6 +3927,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// Set the resume value for this reduction
ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ StoreInst *NewSI =
+ Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ propagateMetadata(NewSI, SI);
+
+ // If the reduction value is used in other places,
+ // then let the code below create PHI's for that.
+ }
+
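
// Editor's illustrative sketch (not part of this change): the source pattern
// handled here is a reduction whose running value is also stored to a
// loop-invariant address; after vectorization only the final reduced value
// needs to be stored, once, outside the loop. Standalone standard C++.
#include <cassert>
int main() {
  int A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  // Scalar form: the store to Out executes on every iteration.
  int Out = 0, Sum = 0;
  for (int I = 0; I < 8; ++I) {
    Sum += A[I];
    Out = Sum; // store to an invariant address
  }
  // After the transform: reduce first, then store the final value once.
  int Sum2 = 0;
  for (int I = 0; I < 8; ++I)
    Sum2 += A[I];
  int Out2 = Sum2;
  assert(Out == Out2 && Out2 == 36);
  return 0;
}
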
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -4273,8 +3946,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
 // fixFirstOrderRecurrence for a more complete explanation of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ State.Plan->removeLiveOut(&LCSSAPhi);
+ }
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
@@ -4287,63 +3962,35 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
-void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
+void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
VPTransformState &State) {
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurKind::Add && RK != RecurKind::Mul)
return;
- Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
- assert(LoopExitInstr && "null loop exit instruction");
- SmallVector<Instruction *, 8> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(LoopExitInstr);
- Visited.insert(LoopExitInstr);
+ SmallVector<VPValue *, 8> Worklist;
+ SmallPtrSet<VPValue *, 8> Visited;
+ Worklist.push_back(PhiR);
+ Visited.insert(PhiR);
while (!Worklist.empty()) {
- Instruction *Cur = Worklist.pop_back_val();
- if (isa<OverflowingBinaryOperator>(Cur))
- for (unsigned Part = 0; Part < UF; ++Part) {
- // FIXME: Should not rely on getVPValue at this point.
- Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
+ VPValue *Cur = Worklist.pop_back_val();
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = State.get(Cur, Part);
+ if (!isa<OverflowingBinaryOperator>(V))
+ break;
+ cast<Instruction>(V)->dropPoisonGeneratingFlags();
}
- for (User *U : Cur->users()) {
- Instruction *UI = cast<Instruction>(U);
- if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
- Visited.insert(UI).second)
- Worklist.push_back(UI);
- }
- }
-}
-
-void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
- if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
- // Some phis were already hand updated by the reduction and recurrence
- // code above, leave them alone.
- continue;
-
- auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
- // Non-instruction incoming values will have only one value.
-
- VPLane Lane = VPLane::getFirstLane();
- if (isa<Instruction>(IncomingValue) &&
- !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
- VF))
- Lane = VPLane::getLastLaneForVF(VF);
-
- // Can be a loop invariant incoming value or the last scalar value to be
- // extracted from the vectorized loop.
- // FIXME: Should not rely on getVPValue at this point.
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Value *lastIncomingValue =
- OrigLoop->isLoopInvariant(IncomingValue)
- ? IncomingValue
- : State.get(State.Plan->getVPValue(IncomingValue, true),
- VPIteration(UF - 1, Lane));
- LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
+ for (VPUser *U : Cur->users()) {
+ auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!UserRecipe)
+ continue;
+ for (VPValue *V : UserRecipe->definedValues())
+ if (Visited.insert(V).second)
+ Worklist.push_back(V);
+ }
}
}
@@ -4421,17 +4068,23 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
} while (Changed);
}
-void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
- for (PHINode *OrigPhi : OrigPHIsToFix) {
- VPWidenPHIRecipe *VPPhi =
- cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
- PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
- // Make sure the builder has a valid insert point.
- Builder.SetInsertPoint(NewPhi);
- for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
- VPValue *Inc = VPPhi->getIncomingValue(i);
- VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
- NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
+ VPTransformState &State) {
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+ for (VPRecipeBase &P : VPBB->phis()) {
+ VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
+ if (!VPPhi)
+ continue;
+ PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
+ // Make sure the builder has a valid insert point.
+ Builder.SetInsertPoint(NewPhi);
+ for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+ VPValue *Inc = VPPhi->getIncomingValue(i);
+ VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+ NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
+ }
}
}
}
@@ -4441,139 +4094,6 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
- VPWidenPHIRecipe *PhiR,
- VPTransformState &State) {
- PHINode *P = cast<PHINode>(PN);
- if (EnableVPlanNativePath) {
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- Type *VecTy = (State.VF.isScalar())
- ? PN->getType()
- : VectorType::get(PN->getType(), State.VF);
- Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
- State.set(PhiR, VecPhi, 0);
- OrigPHIsToFix.push_back(P);
-
- return;
- }
-
- assert(PN->getParent() == OrigLoop->getHeader() &&
- "Non-header phis should have been handled elsewhere");
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #1: We create a new vector PHI node with no incoming edges. We'll use
- // this value when we vectorize all of the instructions that use the PHI.
-
- assert(!Legal->isReductionVariable(P) &&
- "reductions should be handled elsewhere");
-
- setDebugLocFromInst(P);
-
- // This PHINode must be an induction variable.
- // Make sure that we know about it.
- assert(Legal->getInductionVars().count(P) && "Not an induction variable");
-
- InductionDescriptor II = Legal->getInductionVars().lookup(P);
- const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
- PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
-
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- switch (II.getKind()) {
- case InductionDescriptor::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case InductionDescriptor::IK_IntInduction:
- case InductionDescriptor::IK_FpInduction:
- llvm_unreachable("Integer/fp induction is handled elsewhere.");
- case InductionDescriptor::IK_PtrInduction: {
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
-
- if (Cost->isScalarAfterVectorization(P, State.VF)) {
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd =
- Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
- assert((IsUniform || !State.VF.isScalable()) &&
- "Cannot scalarize a scalable VF");
- unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PartStart =
- createStepForVF(Builder, PtrInd->getType(), VF, Part);
-
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Value *Idx = Builder.CreateAdd(
- PartStart, ConstantInt::get(PtrInd->getType(), Lane));
- Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
- Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
- DL, II, State.CFG.PrevBB);
- SclrGep->setName("next.gep");
- State.set(PhiR, SclrGep, VPIteration(Part, Lane));
- }
- }
- return;
- }
- assert(isa<SCEVConstant>(II.getStep()) &&
- "Induction step not a SCEV constant!");
- Type *PhiType = II.getStep()->getType();
-
- // Build a pointer phi
- Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
- Type *ScStValueType = ScalarStartValue->getType();
- PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
- NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
-
- // A pointer induction, performed by using a gep
- BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- Instruction *InductionLoc = LoopLatch->getTerminator();
- const SCEV *ScalarStep = II.getStep();
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Value *ScalarStepValue =
- Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
- Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
- Value *NumUnrolledElems =
- Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
- Value *InductionGEP = GetElementPtrInst::Create(
- II.getElementType(), NewPointerPhi,
- Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
- InductionLoc);
- NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
-
- // Create UF many actual address geps that use the pointer
- // phi as base and a vectorized version of the step value
- // (<step*0, ..., step*N>) as offset.
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Type *VecPhiType = VectorType::get(PhiType, State.VF);
- Value *StartOffsetScalar =
- Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
- Value *StartOffset =
- Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
- // Create a vector of consecutive numbers from zero to VF.
- StartOffset =
- Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
-
- Value *GEP = Builder.CreateGEP(
- II.getElementType(), NewPointerPhi,
- Builder.CreateMul(
- StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
- "vector.gep"));
- State.set(PhiR, GEP, Part);
- }
- }
- }
-}
-
/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
@@ -4597,7 +4117,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPTransformState &State) {
assert(!isa<DbgInfoIntrinsic>(I) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
Module *M = I.getParent()->getParent()->getParent();
auto *CI = cast<CallInst>(&I);
@@ -4627,13 +4147,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ if (!UseVectorIntrinsic ||
+ !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
Arg = State.get(I.value(), Part);
- else {
+ else
Arg = State.get(I.value(), VPIteration(0, 0));
- if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
- TysForDecl.push_back(Arg->getType());
- }
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+ TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
@@ -4661,7 +4181,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
V->copyFastMathFlags(CI);
State.set(Def, V, Part);
- addMetadata(V, &I);
+ State.addMetadata(V, &I);
}
}
@@ -4672,6 +4192,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");
+ // This avoids any chances of creating a REPLICATE recipe during planning
+ // since that would result in generation of scalarized code during execution,
+ // which is not supported for scalable vectors.
+ if (VF.isScalable()) {
+ Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
+ return;
+ }
+
SmallSetVector<Instruction *, 8> Worklist;
// These sets are used to seed the analysis with pointers used by memory
@@ -4761,7 +4289,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
}
// Insert the forced scalars.
- // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())
@@ -4888,6 +4416,27 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
+ // If the group involves a non-integral pointer, we may not be able to
+ // losslessly cast all values to a common type.
+ unsigned InterleaveFactor = Group->getFactor();
+ bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ Instruction *Member = Group->getMember(i);
+ if (!Member)
+ continue;
+ auto *MemberTy = getLoadStoreType(Member);
+ bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
+ // Don't coerce non-integral pointers to integers or vice versa.
+ if (MemberNI != ScalarNI) {
+ // TODO: Consider adding special nullptr value case here
+ return false;
+ } else if (MemberNI && ScalarNI &&
+ ScalarTy->getPointerAddressSpace() !=
+ MemberTy->getPointerAddressSpace()) {
+ return false;
+ }
+ }
+
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps
@@ -5170,7 +4719,7 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return true;
}
- if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ if (!PSE.getPredicate().isAlwaysTrue()) {
reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
"runtime SCEV checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
@@ -5461,14 +5010,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
}
- // For scalable vectors don't use tail folding for low trip counts or
- // optimizing for code size. We only permit this if the user has explicitly
- // requested it.
- if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
- ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
- MaxFactors.ScalableVF.isVector())
- MaxFactors.ScalableVF = ElementCount::getScalable(0);
-
// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
@@ -5511,7 +5052,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
- const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
+ ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5556,9 +5097,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(ClampedConstTripCount);
}
+ TargetTransformInfo::RegisterKind RegKind =
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
- if (TTI.shouldMaximizeVectorBandwidth() ||
- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+ TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
ComputeScalableMaxVF);
@@ -5596,10 +5140,27 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
MaxVF = MinVF;
}
}
+
+ // Invalidate any widening decisions we might have made, in case the loop
+ // requires predication (decided later), but we have already made some
+ // load/store widening decisions.
+ invalidateCostModelingDecisions();
}
return MaxVF;
}
+Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
+ if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+ auto Min = Attr.getVScaleRangeMin();
+ auto Max = Attr.getVScaleRangeMax();
+ if (Max && Min == Max)
+ return Max;
+ }
+
+ return TTI.getVScaleForTuning();
+}
+
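
// Editor's illustrative sketch (not part of this change): the tuning vscale
// is only taken from the function's vscale_range attribute when the range
// pins it to a single value (min == max); otherwise the target's default is
// used. Standalone standard C++ stand-in for the attribute query.
#include <cassert>
#include <optional>
static std::optional<unsigned>
vscaleForTuning(std::optional<unsigned> AttrMin, std::optional<unsigned> AttrMax,
                std::optional<unsigned> TargetDefault) {
  if (AttrMin && AttrMax && *AttrMin == *AttrMax)
    return *AttrMax;    // e.g. vscale_range(2,2) -> 2
  return TargetDefault; // e.g. vscale_range(1,16) -> defer to the target
}
int main() {
  assert(vscaleForTuning(2, 2, 1) == 2u);
  assert(vscaleForTuning(1, 16, 4) == 4u);
  return 0;
}
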
bool LoopVectorizationCostModel::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
@@ -5624,7 +5185,7 @@ bool LoopVectorizationCostModel::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
- if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+ if (Optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
EstimatedWidthA *= VScale.getValue();
if (B.Width.isScalable())
@@ -5651,7 +5212,8 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
assert(VFCandidates.count(ElementCount::getFixed(1)) &&
"Expected Scalar VF to be a candidate");
- const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+ const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
+ ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -5669,12 +5231,12 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
continue;
VectorizationCostTy C = expectedCost(i, &InvalidCosts);
- VectorizationFactor Candidate(i, C.first);
+ VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
- if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
- AssumedMinimumVscale = VScale.getValue();
+ if (Optional<unsigned> VScale = getVScaleForTuning())
+ AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
@@ -5862,7 +5424,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
if (LVP.hasPlanWithVF(ForcedEC))
- return {ForcedEC, 0};
+ return {ForcedEC, 0, 0};
else {
LLVM_DEBUG(
dbgs()
@@ -5885,8 +5447,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
+ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
+ // the main loop handles 8 lanes per iteration. We could still benefit from
+ // vectorizing the epilogue loop with VF=4.
+ ElementCount EstimatedRuntimeVF = MainLoopVF;
+ if (MainLoopVF.isScalable()) {
+ EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+ if (Optional<unsigned> VScale = getVScaleForTuning())
+ EstimatedRuntimeVF *= *VScale;
+ }
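
// Editor's illustrative sketch (not part of this change): how a scalable main
// loop VF is turned into an estimated fixed lane count so that a narrower
// fixed-width epilogue VF can still be considered. Standalone standard C++
// with a hypothetical tuning vscale of 4.
#include <cassert>
int main() {
  const unsigned MainVFKnownMin = 2; // main loop VF = vscale x 2
  const unsigned TuningVScale = 4;   // assumed from the target or attribute
  unsigned EstimatedRuntimeLanes = MainVFKnownMin * TuningVScale; // 8 lanes
  const unsigned EpilogueCandidate = 4; // fixed VF = 4 epilogue
  // A fixed-width epilogue candidate remains interesting as long as it is
  // narrower than the estimated runtime width of the scalable main loop.
  assert(EpilogueCandidate < EstimatedRuntimeLanes);
  return 0;
}
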
+
for (auto &NextVF : ProfitableVFs)
- if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+ ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
+ ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
(Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
@@ -6006,6 +5580,18 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
return 1;
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0) {
+ InstructionCost C = expectedCost(VF).first;
+ assert(C.isValid() && "Expected to have chosen a VF with valid cost");
+ LoopCost = *C.getValue();
+
+ // Loop body is free and there is no need for interleaving.
+ if (LoopCost == 0)
+ return 1;
+ }
+
RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
@@ -6097,16 +5683,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(IC > 0 && "Interleave count must be greater than 0.");
- // If we did not calculate the cost for VF (because the user selected the VF)
- // then we calculate the cost of VF here.
- if (LoopCost == 0) {
- InstructionCost C = expectedCost(VF).first;
- assert(C.isValid() && "Expected to have chosen a VF with valid cost");
- LoopCost = *C.getValue();
- }
-
- assert(LoopCost && "Non-zero loop cost expected");
-
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF.isVector() && HasReductions) {
@@ -6114,9 +5690,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
return IC;
}
- // Note that if we've already vectorized the loop we will have done the
- // runtime check and so interleaving won't require further checks.
- bool InterleavingRequiresRuntimePointerCheck =
+ // For any scalar loop that either requires runtime checks or predication we
+ // are better off leaving this to the unroller. Note that if we've already
+ // vectorized the loop we will have done the runtime check and so interleaving
+ // won't require further checks.
+ bool ScalarInterleavingRequiresPredication =
+ (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ }));
+ bool ScalarInterleavingRequiresRuntimePointerCheck =
(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
// We want to interleave small loops in order to reduce the loop overhead and
@@ -6126,7 +5708,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
<< "LV: VF is " << VF << '\n');
const bool AggressivelyInterleaveReductions =
TTI.enableAggressiveInterleaving(HasReductions);
- if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ if (!ScalarInterleavingRequiresRuntimePointerCheck &&
+ !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
@@ -6289,16 +5872,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
- // A lambda that gets the register usage for the given type and VF.
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
return 0;
- InstructionCost::CostType RegUsage =
- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
- "Nonsensical values for register usage.");
- return RegUsage;
+ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
};
for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -7049,10 +6626,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool TypeNotScalarized = false;
if (VF.isVector() && VectorTy->isVectorTy()) {
- unsigned NumParts = TTI.getNumberOfParts(VectorTy);
- if (NumParts)
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- else
+ if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
+ if (VF.isScalable())
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar ones.
+ // If we ever find a target which wants to lower scalable vectors
+ // back to scalars, we'll need to update this code to explicitly
+ // ask TTI about the register class uses for each part.
+ TypeNotScalarized = NumParts <= VF.getKnownMinValue();
+ else
+ TypeNotScalarized = NumParts < VF.getKnownMinValue();
+ } else
C = InstructionCost::getInvalid();
}
return VectorizationCostTy(C, TypeNotScalarized);
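
// Editor's illustrative sketch (not part of this change): when a vector type
// legalizes into NumParts registers, the fixed-width case only counts as "not
// scalarized" if NumParts is strictly below VF, while for scalable types
// NumParts == VF.getKnownMinValue() is still treated as vector code, since
// scalable registers form their own register class. Standalone standard C++.
#include <cassert>
static bool typeNotScalarized(unsigned NumParts, unsigned VFKnownMin,
                              bool Scalable) {
  if (NumParts == 0)
    return false; // the real code marks the cost invalid instead
  return Scalable ? NumParts <= VFKnownMin : NumParts < VFKnownMin;
}
int main() {
  assert(typeNotScalarized(2, 4, /*Scalable=*/false));  // <4 x i64> in 2 regs
  assert(!typeNotScalarized(4, 4, /*Scalable=*/false)); // one element per reg
  assert(typeNotScalarized(1, 1, /*Scalable=*/true));   // <vscale x 1 x i64>
  return 0;
}
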
@@ -7128,8 +6712,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
- assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
- "Cannot yet scalarize uniform stores");
Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
}
@@ -7487,8 +7069,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
- if (Decision == CM_Scalarize)
+ if (Decision == CM_Scalarize) {
+ if (VF.isScalable() && isa<StoreInst>(I))
+ // We can't scalarize a scalable vector store (even a uniform one
+ // currently), return an invalid cost so as to prevent vectorization.
+ return InstructionCost::getInvalid();
Width = ElementCount::getFixed(1);
+ }
}
VectorTy = ToVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
@@ -7656,6 +7243,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ // Find all stores to invariant variables. Since they are going to be sunk
+ // outside the loop, we do not need to calculate the cost for them.
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ ValuesToIgnore.insert(&I);
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
@@ -7757,7 +7354,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
if (VPlanBuildStressTest)
return VectorizationFactor::Disabled();
- return {VF, 0 /*Cost*/};
+ return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
}
LLVM_DEBUG(
@@ -7766,6 +7363,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+ return (NumRuntimePointerChecks >
+ VectorizerParams::RuntimeMemoryCheckThreshold &&
+ !Hints.allowReordering()) ||
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7800,7 +7405,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
- return {{UserVF, 0}};
+ return {{UserVF, 0, 0}};
} else
reportVectorizationInfo("UserVF ignored because of invalid costs.",
"InvalidCost", ORE, OrigLoop);
@@ -7834,30 +7439,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
-
- // Check if it is profitable to vectorize with runtime checks.
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
- if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysisAliasing(
- DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Hints.emitRemarkWithHints();
- return VectorizationFactor::Disabled();
- }
- }
- return SelectedVF;
+ return CM.selectVectorizationFactor(VFCandidates);
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -7910,17 +7492,36 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ bool IsEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
// Perform the actual loop transformation.
- // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ // 1. Set up the skeleton for vectorization, including vector pre-header and
+ // middle block. The vector loop is created during VPlan execution.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
Value *CanonicalIVStartValue;
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton();
+
+ // Only use noalias metadata when using memory checks guaranteeing no overlap
+ // across all iterations.
+ const LoopAccessInfo *LAI = ILV.Legal->getLAI();
+ if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
+ !LAI->getRuntimePointerChecking()->getDiffChecks()) {
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
+ // TODO: Find a better way to re-use LoopVersioning functionality to add
+ // metadata.
+ State.LVer = std::make_unique<LoopVersioning>(
+ *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
+ PSE.getSE());
+ State.LVer->prepareNoAliasMetadata();
+ }
+
ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -7936,7 +7537,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State);
+ CanonicalIVStartValue, State,
+ IsEpilogueVectorization);
+
BestVPlan.execute(&State);
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7947,8 +7550,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
- Loop *L = LI->getLoopFor(State.CFG.PrevBB);
- if (VectorizedLoopID.hasValue())
+ VPBasicBlock *HeaderVPBB =
+ BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
+ Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+ if (VectorizedLoopID)
L->setLoopID(VectorizedLoopID.getValue());
else {
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -7965,7 +7570,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
- ILV.fixVectorizedLoop(State);
+ ILV.fixVectorizedLoop(State, BestVPlan);
ILV.printDebugTracesAtEnd();
}
@@ -8036,22 +7641,31 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Loop *Lp = createVectorLoopSkeleton("");
+
+ // Workaround! Compute the trip count of the original loop and cache it
+ // before we start modifying the CFG. This code has a systemic problem
+ // wherein it tries to run analysis over partially constructed IR; this is
+ // wrong, and not simply for SCEV. The trip count of the original loop
+ // simply happens to be prone to hitting this in practice. In theory, we
+ // can hit the same issue for any SCEV, or ValueTracking query done during
+ // mutation. See PR49900.
+ getOrCreateTripCount(OrigLoop->getLoopPreheader());
+ createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
EPI.EpilogueIterationCountCheck =
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
+ emitIterationCountCheck(LoopScalarPreHeader, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
+ EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
// Generate the code that checks at runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+ EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
@@ -8060,19 +7674,17 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
// trip count. Note: the branch will get updated later on when we vectorize
// the epilogue.
EPI.MainLoopIterationCountCheck =
- emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
+ emitIterationCountCheck(LoopScalarPreHeader, false);
// Generate the induction variable.
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- EPI.VectorTripCount = CountRoundDown;
- createHeaderBranch(Lp);
+ EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
// Skip induction resume value creation here because they will be created in
// the second pass. If we created them here, they wouldn't be used anyway,
// because the vplan in the second pass still contains the inductions from the
// original loop.
- return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
+ return {completeLoopSkeleton(OrigLoopID), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -8092,13 +7704,13 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
});
}
-BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
- Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
- assert(L && "Expected valid Loop.");
+BasicBlock *
+EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
+ bool ForEpilogue) {
assert(Bypass && "Expected valid bypass basic block.");
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
- Value *Count = getOrCreateTripCount(L);
+ Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -8157,7 +7769,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
+ createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
@@ -8166,7 +7778,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
LoopVectorPreHeader =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, "vec.epilog.ph");
- emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
+ emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
// Adjust the control flow taking the state info from the main loop
@@ -8238,9 +7850,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
EPI.MainLoopIterationCountCheck);
- // Generate the induction variable.
- createHeaderBranch(Lp);
-
// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
// iterations left once the vector loop has completed.
@@ -8248,15 +7857,15 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ createInductionResumeValues({VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
+ return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
- Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+ BasicBlock *Bypass, BasicBlock *Insert) {
 assert(EPI.TripCount &&
 "Expected trip count to have been saved in the first pass.");
@@ -8397,7 +8006,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
assert(CM.foldTailByMasking() && "must fold the tail");
- VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB =
+ Plan->getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
@@ -8439,8 +8049,6 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
"Must be called with either a load or store");
auto willWiden = [&](ElementCount VF) -> bool {
- if (VF.isScalar())
- return false;
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, VF);
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
@@ -8477,11 +8085,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
Mask, Consecutive, Reverse);
}
-static VPWidenIntOrFpInductionRecipe *
-createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
- VPValue *Start, const InductionDescriptor &IndDesc,
- LoopVectorizationCostModel &CM, Loop &OrigLoop,
- VFRange &Range) {
+/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
+/// insert a recipe to expand the step for the induction recipe.
+static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
+ PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
+ const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
// Returns true if an instruction \p I should be scalarized instead of
// vectorized for the chosen vectorization factor.
auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
@@ -8489,18 +8098,6 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
CM.isProfitableToScalarize(I, VF);
};
- bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) {
- // Returns true if we should generate a scalar version of \p IV.
- if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
- };
- return any_of(PhiOrTrunc->users(), isScalarInst);
- },
- Range);
bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
return ShouldScalarizeInstruction(PhiOrTrunc, VF);
@@ -8508,30 +8105,38 @@ createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
Range);
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+ assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
+ "step must be loop invariant");
+
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
- NeedsScalarIV, !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
+ !NeedsScalarIVOnly);
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
!NeedsScalarIVOnly);
}
-VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
+VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
+ PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
- Range);
+ return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
+ // Check if this is pointer induction. If so, build the recipe for it.
+ if (auto *II = Legal->getPointerInductionDescriptor(Phi))
+ return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
+ *PSE.getSE());
return nullptr;
}
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
- VPlan &Plan) const {
+ TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8552,7 +8157,8 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
+ return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
+ *PSE.getSE(), *OrigLoop, Range);
}
return nullptr;
}
@@ -8569,13 +8175,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
return Operands[0];
}
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ // For in-loop reductions, we do not need to create an additional select.
+ VPValue *InLoopVal = nullptr;
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ PHINode *PhiOp =
+ dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
+ if (PhiOp && CM.isInLoopReduction(PhiOp)) {
+ assert(!InLoopVal && "Found more than one in-loop reduction!");
+ InLoopVal = Operands[In];
+ }
+ }
+
+ assert((!InLoopVal || NumIncoming == 2) &&
+ "Found an in-loop reduction for PHI with unexpected number of "
+ "incoming values");
+ if (InLoopVal)
+ return Operands[Operands[0] == InLoopVal ? 1 : 0];
+
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
SmallVector<VPValue *, 2> OperandsWithMask;
- unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
@@ -8681,6 +8304,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
case Instruction::URem:
case Instruction::Xor:
case Instruction::ZExt:
+ case Instruction::Freeze:
return true;
}
return false;
@@ -8806,14 +8430,14 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
Plan->removeVPValueFor(Instr);
Plan->addVPValue(Instr, PHIRecipe);
}
- auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
- VPBlockUtils::connectBlocks(Pred, Exit);
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exiting);
return Region;
}
@@ -8822,52 +8446,37 @@ VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
VFRange &Range, VPlanPtr &Plan) {
- // First, check for specific widening recipes that deal with calls, memory
- // operations, inductions and Phi nodes.
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
-
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
-
+ // First, check for specific widening recipes that deal with inductions, Phi
+ // nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Operands, Plan);
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
return toVPRecipeResult(Recipe);
VPHeaderPHIRecipe *PhiRecipe = nullptr;
- if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
- VPValue *StartV = Operands[0];
- if (Legal->isReductionVariable(Phi)) {
- const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars().find(Phi)->second;
- assert(RdxDesc.getRecurrenceStartValue() ==
- Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
- CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc));
- } else {
- PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
- }
-
- // Record the incoming value from the backedge, so we can add the incoming
- // value from the backedge after all recipes have been created.
- recordRecipeOf(cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
- PhisToFix.push_back(PhiRecipe);
+ assert((Legal->isReductionVariable(Phi) ||
+ Legal->isFirstOrderRecurrence(Phi)) &&
+ "can only widen reductions and first-order recurrences here");
+ VPValue *StartV = Operands[0];
+ if (Legal->isReductionVariable(Phi)) {
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
+ assert(RdxDesc.getRecurrenceStartValue() ==
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc));
} else {
- // TODO: record backedge value for remaining pointer induction phis.
- assert(Phi->getType()->isPointerTy() &&
- "only pointer phis should be handled here");
- assert(Legal->getInductionVars().count(Phi) &&
- "Not an induction variable");
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
- PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
+ PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
+ // Record the incoming value from the backedge, so we can add the incoming
+ // value from the backedge after all recipes have been created.
+ recordRecipeOf(cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ PhisToFix.push_back(PhiRecipe);
return toVPRecipeResult(PhiRecipe);
}
@@ -8876,6 +8485,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Range, *Plan)))
return toVPRecipeResult(Recipe);
+ // All widen recipes below deal only with VF > 1.
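+  // getDecisionAndClampRange clamps Range so the same decision applies to
+  // every VF remaining in it.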
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return nullptr;
+
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8949,15 +8569,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
// BranchOnCount VPInstruction to the latch.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- bool HasNUW, bool IsVPlanNative) {
+ bool HasNUW) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddVPValue(StartIdx);
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
- if (IsVPlanNative)
- Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
Header->insert(CanonicalIVPHI, Header->begin());
auto *CanonicalIVIncrement =
@@ -8966,11 +8584,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
{CanonicalIVPHI}, DL);
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
- VPBasicBlock *EB = TopRegion->getExitBasicBlock();
- if (IsVPlanNative) {
- EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
- EB->setCondBit(nullptr);
- }
+ VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
EB->appendRecipe(CanonicalIVIncrement);
auto *BranchOnCount =
@@ -8979,6 +8593,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
EB->appendRecipe(BranchOnCount);
}
+// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
+// original exit block.
+static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
+ VPlan &Plan) {
+ BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
+ BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
+ // Only handle single-exit loops with unique exit blocks for now.
+ if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
+ return;
+
+ // Introduce VPUsers modeling the exit values.
+ for (PHINode &ExitPhi : ExitBB->phis()) {
+ Value *IncomingValue =
+ ExitPhi.getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
+ Plan.addLiveOut(&ExitPhi, V);
+ }
+}
+
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -9007,7 +8641,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(Phi);
for (auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
- // For min/max reducitons, where we have a pair of icmp/select, we also
+ // For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
"Only min/max recurrences allowed for inloop reductions");
@@ -9039,18 +8673,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, with separate header and latch blocks.
- VPBasicBlock *HeaderVPBB = new VPBasicBlock();
+ // Create initial VPlan skeleton, starting with a block for the pre-header,
+ // followed by a region for the vector loop, followed by the middle block. The
+ // skeleton vector loop region contains a header and latch block.
+ VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
+ auto Plan = std::make_unique<VPlan>(Preheader);
+
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- auto Plan = std::make_unique<VPlan>(TopRegion);
+ VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
+ VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+ VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
Instruction *DLInst =
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- !CM.foldTailByMasking(), false);
+ !CM.foldTailByMasking());
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -9063,11 +8704,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
- VPBB->setName(BB->getName());
+ if (VPBB != HeaderVPBB)
+ VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
- // TODO: Model and preserve debug instrinsics in VPlan.
+ // TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
@@ -9085,6 +8727,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
auto OpRange = Plan->mapToVPValues(Instr->operands());
Operands = {OpRange.begin(), OpRange.end()};
}
+
+      // Invariant stores inside the loop will be deleted, and a single store
+      // with the final reduction value will be added to the exit block.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(&I)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ continue;
+
if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
Instr, Operands, Range, Plan)) {
// If Instr can be simplified to an existing VPValue, use it.
@@ -9135,14 +8785,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
+ HeaderVPBB->setName("vector.body");
+
// Fold the last, empty block into its predecessor.
VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
assert(VPBB && "expected to fold last (empty) block");
// After here, VPBB should not be used.
VPBB = nullptr;
- assert(isa<VPRegionBlock>(Plan->getEntry()) &&
- !Plan->getEntry()->getEntryBasicBlock()->empty() &&
+ addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+
+ assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+ !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
@@ -9222,12 +8876,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
+ adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ for (VPRecipeBase &R :
+ Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
if (!RecurPhi)
continue;
@@ -9236,7 +8891,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBasicBlock *InsertBlock = PrevRecipe->getParent();
auto *Region = GetReplicateRegion(PrevRecipe);
if (Region)
- InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
+ InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
+ if (!InsertBlock) {
+ InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
+ VPBlockUtils::insertBlockAfter(InsertBlock, Region);
+ }
if (Region || PrevRecipe->isPhi())
Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
else
@@ -9283,13 +8942,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // From this point onwards, VPlan-to-VPlan transformations may change the plan
- // in ways that accessing values using original IR values is incorrect.
- Plan->disableValue2VPValue();
-
- VPlanTransforms::sinkScalarOperands(*Plan);
- VPlanTransforms::mergeReplicateRegions(*Plan);
-
std::string PlanName;
raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;
@@ -9303,10 +8955,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ // From this point onwards, VPlan-to-VPlan transformations may change the plan
+ // in ways that accessing values using original IR values is incorrect.
+ Plan->disableValue2VPValue();
+
+ VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
+ VPlanTransforms::sinkScalarOperands(*Plan);
+ VPlanTransforms::mergeReplicateRegions(*Plan);
+ VPlanTransforms::removeDeadRecipes(*Plan);
+ VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
+
// Fold Exit block into its predecessor if possible.
// TODO: Fold block earlier once all VPlan transforms properly maintain a
// VPBasicBlock as exit.
- VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
@@ -9331,23 +8993,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VF *= 2)
Plan->addVF(VF);
- if (EnableVPlanPredication) {
- VPlanPredicator VPP(*Plan);
- VPP.predicate();
-
- // Avoid running transformation to recipes until masked code generation in
- // VPlan-native path is in place.
- return Plan;
- }
-
SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
DeadInstructions, *PSE.getSE());
+ // Remove the existing terminator of the exiting block of the top-most region.
+ // A BranchOnCount will be added instead when adding the canonical IV recipes.
+ auto *Term =
+ Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
+ Term->eraseFromParent();
+
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- true, true);
+ true);
return Plan;
}
@@ -9399,7 +9058,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
- auto *CondOp = CM.foldTailByMasking()
+ auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
@@ -9441,7 +9100,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// dedicated latch block.
if (CM.foldTailByMasking()) {
Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
- for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ for (VPRecipeBase &R :
+ Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || PhiR->isInLoop())
continue;
@@ -9493,7 +9153,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
void VPWidenSelectRecipe::execute(VPTransformState &State) {
auto &I = *cast<SelectInst>(getUnderlyingInstr());
- State.ILV->setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
@@ -9508,7 +9168,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
Value *Op1 = State.get(getOperand(2), Part);
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
State.set(this, Sel, Part);
- State.ILV->addMetadata(Sel, &I);
+ State.addMetadata(Sel, &I);
}
}
@@ -9542,7 +9202,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
- State.ILV->setDebugLocFromInst(&I);
+ State.setDebugLocFromInst(&I);
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Value *, 2> Ops;
@@ -9565,17 +9225,28 @@ void VPWidenRecipe::execute(VPTransformState &State) {
// Use this vector value for all users of the original instruction.
State.set(this, V, Part);
- State.ILV->addMetadata(V, &I);
+ State.addMetadata(V, &I);
}
break;
}
+ case Instruction::Freeze: {
+ State.setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Op = State.get(getOperand(0), Part);
+
+ Value *Freeze = Builder.CreateFreeze(Op);
+ State.set(this, Freeze, Part);
+ }
+ break;
+ }
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
bool FCmp = (I.getOpcode() == Instruction::FCmp);
auto *Cmp = cast<CmpInst>(&I);
- State.ILV->setDebugLocFromInst(Cmp);
+ State.setDebugLocFromInst(Cmp);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -9589,7 +9260,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
State.set(this, C, Part);
- State.ILV->addMetadata(C, &I);
+ State.addMetadata(C, &I);
}
break;
@@ -9608,7 +9279,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
auto *CI = cast<CastInst>(&I);
- State.ILV->setDebugLocFromInst(CI);
+ State.setDebugLocFromInst(CI);
/// Vectorize casts.
Type *DestTy = (State.VF.isScalar())
@@ -9619,7 +9290,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
Value *A = State.get(getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
State.set(this, Cast, Part);
- State.ILV->addMetadata(Cast, &I);
+ State.addMetadata(Cast, &I);
}
break;
}
@@ -9655,7 +9326,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
State.set(this, EntryPart, Part);
- State.ILV->addMetadata(EntryPart, GEP);
+ State.addMetadata(EntryPart, GEP);
}
} else {
// If the GEP has at least one loop-varying operand, we are sure to
@@ -9693,32 +9364,276 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
// but it should be a vector, otherwise.
- auto *NewGEP = IsInBounds
- ? State.Builder.CreateInBoundsGEP(
- GEP->getSourceElementType(), Ptr, Indices)
- : State.Builder.CreateGEP(GEP->getSourceElementType(),
- Ptr, Indices);
+ auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
+ Indices, "", IsInBounds);
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
State.set(this, NewGEP, Part);
- State.ILV->addMetadata(NewGEP, GEP);
+ State.addMetadata(NewGEP, GEP);
}
}
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
- State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
+
+ Value *Start = getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = getInductionDescriptor();
+ TruncInst *Trunc = getTruncInst();
+ IRBuilderBase &Builder = State.Builder;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(State.VF.isVector() && "must have vector VF");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
+
+ // Now do the actual transformations, and start with fetching the step value.
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
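+  // SteppedStart is the lane-wise start of the vector IV:
+  // <Start, Start + Step, ..., Start + (VF-1) * Step>.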
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Type *StepType = Step->getType();
+ Value *RuntimeVF;
+ if (Step->getType()->isFloatingPointTy())
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+ Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*State.CFG.PrevBB->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ State.set(this, LastInduction, Part);
+
+ if (isa<TruncInst>(EntryVal))
+ State.addMetadata(LastInduction, EntryVal);
+
+ LastInduction = cast<Instruction>(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ LastInduction->setName("vec.ind.next");
+ VecInd->addIncoming(SteppedStart, VectorPH);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ VecInd->addIncoming(LastInduction, VectorPH);
+}
+
+void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
+ assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
+ "Not a pointer induction according to InductionDescriptor!");
+ assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
+ "Unexpected type.");
+
+ auto *IVR = getParent()->getPlan()->getCanonicalIV();
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+
+ if (onlyScalarsGenerated(State.VF)) {
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = State.Builder.CreateSExtOrTrunc(
+ CanonicalIV, IndDesc.getStep()->getType());
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If the instruction is uniform, we only need to generate the
+ // first lane. Otherwise, we generate all VF values.
+ bool IsUniform = vputils::onlyFirstLaneUsed(this);
+ assert((IsUniform || !State.VF.isScalable()) &&
+ "Cannot scalarize a scalable VF");
+ unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
+
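+    // Each scalar address is the induction start advanced by
+    // (PtrInd + VF * Part + Lane) * Step elements.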
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *PartStart =
+ createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
+
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ Value *Idx = State.Builder.CreateAdd(
+ PartStart, ConstantInt::get(PtrInd->getType(), Lane));
+ Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
+
+ Value *Step = CreateStepValue(IndDesc.getStep(), SE,
+ State.CFG.PrevBB->getTerminator());
+ Value *SclrGep = emitTransformedIndex(
+ State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
+ SclrGep->setName("next.gep");
+ State.set(this, SclrGep, VPIteration(Part, Lane));
+ }
+ }
+ return;
+ }
+
+ assert(isa<SCEVConstant>(IndDesc.getStep()) &&
+ "Induction step not a SCEV constant!");
+ Type *PhiType = IndDesc.getStep()->getType();
+
+ // Build a pointer phi
+ Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
+ Type *ScStValueType = ScalarStartValue->getType();
+ PHINode *NewPointerPhi =
+ PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+
+  // A pointer induction, performed using a GEP.
+ const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
+ Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
+
+ const SCEV *ScalarStep = IndDesc.getStep();
+ SCEVExpander Exp(SE, DL, "induction");
+ Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
+ Value *NumUnrolledElems =
+ State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
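+  // The pointer phi is advanced by Step * VF * UF elements per vector
+  // iteration; per-part addresses are derived from it below via vector GEPs.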
+ Value *InductionGEP = GetElementPtrInst::Create(
+ IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+ InductionLoc);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+
+ // Create UF many actual address geps that use the pointer
+ // phi as base and a vectorized version of the step value
+ // (<step*0, ..., step*N>) as offset.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Type *VecPhiType = VectorType::get(PhiType, State.VF);
+ Value *StartOffsetScalar =
+ State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
+ Value *StartOffset =
+ State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
+ // Create a vector of consecutive numbers from zero to VF.
+ StartOffset = State.Builder.CreateAdd(
+ StartOffset, State.Builder.CreateStepVector(VecPhiType));
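+    // StartOffset is now <Part * VF + 0, ..., Part * VF + VF - 1>.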
+
+ Value *GEP = State.Builder.CreateGEP(
+ IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.CreateMul(
+ StartOffset,
+ State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
+ "vector.gep"));
+ State.set(this, GEP, Part);
+ }
}
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
- State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
- State);
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (IndDesc.getInductionBinOp() &&
+ isa<FPMathOperator>(IndDesc.getInductionBinOp()))
+ State.Builder.setFastMathFlags(
+ IndDesc.getInductionBinOp()->getFastMathFlags());
+
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
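+  // Derive the scalar IV from the canonical IV, converting it and applying
+  // the start value and step when this induction is not the canonical one
+  // (or its type differs).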
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+ if (!isCanonical() || CanonicalIV->getType() != Ty) {
+ ScalarIV =
+ Ty->isIntegerTy()
+ ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
+ : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
+ ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
+ getStartValue()->getLiveInIRValue(), Step,
+ IndDesc);
+ ScalarIV->setName("offset.idx");
+ }
+ if (TruncToTy) {
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
+ Step = State.Builder.CreateTrunc(Step, TruncToTy);
+ }
+ return ScalarIV;
+ };
+
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (State.VF.isVector()) {
+ buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
+ return;
+ }
+
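+  // Scalar VF: emit one scalar step per unrolled part,
+  // i.e. ScalarIV + (VF * Part) * Step.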
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
+ Value *EntryPart;
+ if (Step->getType()->isFloatingPointTy()) {
+ Value *StartIdx =
+ getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
+ // Floating-point operations inherit FMF via the builder's flags.
+ Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
+ EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
+ ScalarIV, MulOp);
+ } else {
+ Value *StartIdx =
+ getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
+ EntryPart = State.Builder.CreateAdd(
+ ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
+ }
+ State.set(this, EntryPart, Part);
+ }
}
void VPBlendRecipe::execute(VPTransformState &State) {
- State.ILV->setDebugLocFromInst(Phi, &State.Builder);
+ State.setDebugLocFromInst(Phi);
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
@@ -9979,7 +9894,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// Handle Stores:
if (SI) {
- State.ILV->setDebugLocFromInst(SI);
+ State.setDebugLocFromInst(SI);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
@@ -10005,14 +9920,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
else
NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
- State.ILV->addMetadata(NewSI, SI);
+ State.addMetadata(NewSI, SI);
}
return;
}
// Handle loads.
assert(LI && "Must have a load instruction");
- State.ILV->setDebugLocFromInst(LI);
+ State.setDebugLocFromInst(LI);
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
@@ -10020,7 +9935,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
- State.ILV->addMetadata(NewLI, LI);
+ State.addMetadata(NewLI, LI);
} else {
auto *VecPtr =
CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
@@ -10033,12 +9948,12 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
// Add metadata to the load, but setVectorValue to the reverse shuffle.
- State.ILV->addMetadata(NewLI, LI);
+ State.addMetadata(NewLI, LI);
if (Reverse)
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
}
- State.set(this, NewLI, Part);
+ State.set(getVPSingleValue(), NewLI, Part);
}
}
@@ -10119,7 +10034,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
// At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
- assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
+ assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
+ isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -10201,8 +10117,7 @@ static bool processLoopInVPlanNativePath(
// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.
- if (VPlanBuildStressTest || EnableVPlanPredication ||
- VectorizationFactor::Disabled() == VF)
+ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
return false;
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
@@ -10214,7 +10129,7 @@ static bool processLoopInVPlanNativePath(
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -10282,8 +10197,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */
- LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
+ << L->getHeader()->getParent()->getName() << "' from "
<< DebugLocStr << "\n");
LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
@@ -10438,10 +10353,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
+ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+ F->getParent()->getDataLayout());
if (MaybeVF) {
+ if (LVP.requiresTooManyRuntimeChecks()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(
+ DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
VF = *MaybeVF;
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+
+ unsigned SelectedIC = std::max(IC, UserIC);
+ // Optimistically generate runtime checks if they are needed. Drop them if
+ // they turn out to not be profitable.
+ if (VF.Width.isVector() || SelectedIC > 1)
+ Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
}
// Identify the diagnostic messages that should be produced.
@@ -10529,14 +10464,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();
{
- // Optimistically generate runtime checks. Drop them if they turn out to not
- // be profitable. Limit the scope of Checks, so the cleanup happens
- // immediately after vector codegeneration is done.
- GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
- F->getParent()->getDataLayout());
- if (!VF.Width.isScalar() || IC > 1)
- Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
-
using namespace ore;
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -10546,7 +10473,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10571,12 +10498,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
- DT);
+ DT, true);
++LoopsVectorized;
- simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
- formLCSSARecursively(*L, *DT, LI, SE);
-
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
@@ -10586,23 +10510,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Checks);
VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+ VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
+ Header->setName("vec.epilog.vector.body");
// Ensure that the start values for any VPReductionPHIRecipes are
// updated before vectorising the epilogue loop.
- VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
if (auto *Resume = MainILV.getReductionResumeValue(
ReductionPhi->getRecurrenceDescriptor())) {
- VPValue *StartVal = new VPValue(Resume);
- BestEpiPlan.addExternalDef(StartVal);
+ VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
ReductionPhi->setOperand(0, StartVal);
}
}
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT);
+ DT, true);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10612,7 +10537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
&LVL, &CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10638,7 +10563,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID.hasValue()) {
+ if (RemainderLoopID) {
L->setLoopID(RemainderLoopID.getValue());
} else {
if (DisableRuntimeUnroll)
@@ -10720,8 +10645,12 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
PreservedAnalyses LoopVectorizePass::run(Function &F,
FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
+ // There are no loops in the function. Return before computing other expensive
+ // analyses.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);