author:    Dimitry Andric <dim@FreeBSD.org>  2022-07-14 18:58:48 +0000
committer: Dimitry Andric <dim@FreeBSD.org>  2023-02-08 19:03:59 +0000
commit:    753f127f3ace09432b2baeffd71a308760641a62
tree:      97694ab339c0ca6145ebb429c7505019565b9a60 /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent:    81ad626541db97eb356e2c1d4a20eb2a26a766ab
parent:    1f917f69ff07f09b6dbb670971f57f8efe718b84
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
11 files changed, 1006 insertions, 574 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6242d9a93fc1..183ba86abcb4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -386,20 +386,6 @@ static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
   return true;
 }
 
-/// Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
-  for (PHINode &Phi : BB->phis()) {
-    for (Value *V : Phi.incoming_values())
-      if (auto *C = dyn_cast<Constant>(V))
-        if (C->canTrap())
-          return false;
-  }
-  return true;
-}
-
 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
   if (Ty->isPointerTy())
     return DL.getIntPtrType(Ty);
@@ -993,7 +979,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
-  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getPredicate());
   return true;
 }
@@ -1098,13 +1083,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
     SmallPtrSetImpl<const Instruction *> &MaskedOp,
     SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
   for (Instruction &I : *BB) {
-    // Check that we don't have a constant expression that can trap as operand.
-    for (Value *Operand : I.operands()) {
-      if (auto *C = dyn_cast<Constant>(Operand))
-        if (C->canTrap())
-          return false;
-    }
-
     // We can predicate blocks with calls to assume, as long as we drop them in
     // case we flatten the CFG via predication.
     if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
@@ -1190,7 +1168,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   }
 
   // Collect the blocks that need predication.
-  BasicBlock *Header = TheLoop->getHeader();
   for (BasicBlock *BB : TheLoop->blocks()) {
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator())) {
@@ -1212,13 +1189,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
             BB->getTerminator());
         return false;
       }
-    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
-      reportVectorizationFailure(
-          "Control flow cannot be substituted for a select",
-          "control flow cannot be substituted for a select",
-          "NoCFGForSelect", ORE, TheLoop,
-          BB->getTerminator());
-      return false;
     }
   }
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 0cb2032fa45a..2e9a9fe0640e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -33,7 +33,6 @@ class LoopInfo;
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class PredicatedScalarEvolution;
-class LoopVectorizationRequirements;
 class LoopVectorizeHints;
 class OptimizationRemarkEmitter;
 class TargetTransformInfo;
@@ -46,8 +45,9 @@ class VPBuilder {
   VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
 
   VPInstruction *createInstruction(unsigned Opcode,
-                                   ArrayRef<VPValue *> Operands, DebugLoc DL) {
-    VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL);
+                                   ArrayRef<VPValue *> Operands, DebugLoc DL,
+                                   const Twine &Name = "") {
+    VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name);
     if (BB)
       BB->insert(Instr, InsertPt);
     return Instr;
@@ -55,8 +55,8 @@ class VPBuilder {
 
   VPInstruction *createInstruction(unsigned Opcode,
                                    std::initializer_list<VPValue *> Operands,
-                                   DebugLoc DL) {
-    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL);
+                                   DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name);
   }
 
 public:
@@ -124,34 +124,37 @@ public:
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
   VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        Instruction *Inst = nullptr) {
+                        Instruction *Inst = nullptr, const Twine &Name = "") {
     DebugLoc DL;
     if (Inst)
       DL = Inst->getDebugLoc();
-    VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL);
+    VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
   VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        DebugLoc DL) {
-    return createInstruction(Opcode, Operands, DL);
+                        DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(Opcode, Operands, DL, Name);
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL) {
-    return createInstruction(VPInstruction::Not, {Operand}, DL);
+  VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
-  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
-    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL);
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+                     const Twine &Name = "") {
+    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
   }
 
-  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
-    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL);
+  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+                    const Twine &Name = "") {
+    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name);
   }
 
   VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL) {
-    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL);
+                        DebugLoc DL, const Twine &Name = "") {
+    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
+                        Name);
   }
 
   //===--------------------------------------------------------------------===//
@@ -191,6 +194,10 @@ struct VectorizationFactor {
   /// Cost of the scalar loop.
   InstructionCost ScalarCost;
 
+  /// The minimum trip count required to make vectorization profitable, e.g. due
+  /// to runtime checks.
+  ElementCount MinProfitableTripCount;
+
   VectorizationFactor(ElementCount Width, InstructionCost Cost,
                       InstructionCost ScalarCost)
       : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
@@ -268,8 +275,6 @@ class LoopVectorizationPlanner {
 
   const LoopVectorizeHints &Hints;
 
-  LoopVectorizationRequirements &Requirements;
-
   OptimizationRemarkEmitter *ORE;
 
   SmallVector<VPlanPtr, 4> VPlans;
@@ -285,10 +290,9 @@ public:
                            InterleavedAccessInfo &IAI,
                            PredicatedScalarEvolution &PSE,
                            const LoopVectorizeHints &Hints,
-                           LoopVectorizationRequirements &Requirements,
                            OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
-        PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
+        PSE(PSE), Hints(Hints), ORE(ORE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
@@ -332,11 +336,6 @@ public:
   bool requiresTooManyRuntimeChecks() const;
 
 protected:
-  /// Collect the instructions from the original loop that would be trivially
-  /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions(
-      SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop.
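Note on the Twine parameter threaded through VPBuilder above: the name is stored on the VPInstruction and handed to IRBuilder when the recipe executes, so vectorizer-generated IR carries readable value names. A minimal sketch of a call site (the builder state and the EdgeMask operand are hypothetical, not part of this commit):

    // Hypothetical: name the inverted edge mask for a predicated block.
    VPValue *Mask = Builder.createNot(EdgeMask, DL, "not.edge.mask");
    // On execution, IRBuilder emits e.g. %not.edge.mask = xor <N x i1> ...

The default Name of "" preserves the old behavior, so existing callers need no changes.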
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b637b2d5ddae..0777a1385916 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
                  "value are vectorized only if no scalar iteration overheads "
                  "are incurred."));
 
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
-    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum allowed number of runtime memory checks with a "
-             "vectorize(enable) pragma."));
+static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
+    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum allowed number of runtime memory checks"));
 
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
@@ -442,6 +441,7 @@ public:
                       const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
                       ElementCount VecWidth,
+                      ElementCount MinProfitableTripCount,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
@@ -453,6 +453,11 @@ public:
     // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+    if (MinProfitableTripCount.isZero())
+      this->MinProfitableTripCount = VecWidth;
+    else
+      this->MinProfitableTripCount = MinProfitableTripCount;
   }
 
   virtual ~InnerLoopVectorizer() = default;
@@ -656,6 +661,8 @@ protected:
   /// vector elements.
   ElementCount VF;
 
+  ElementCount MinProfitableTripCount;
+
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -735,6 +742,7 @@ public:
                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                            ElementCount::getFixed(1),
                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                             BFI, PSI, Check) {}
@@ -783,8 +791,8 @@ public:
                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                     GeneratedRTChecks &Checks)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
-                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
-                            Checks),
+                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
+                            CM, BFI, PSI, Checks),
         EPI(EPI) {}
 
   // Override this function to handle the more complex control flow around the
@@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
           isa<VPInterleaveRecipe>(CurRec) ||
           isa<VPScalarIVStepsRecipe>(CurRec) ||
-          isa<VPCanonicalIVPHIRecipe>(CurRec))
+          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
+          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
         continue;
 
       // This recipe contributes to the address computation of a widen
@@ -1503,6 +1512,13 @@ public:
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
 
+  /// Returns true if were tail-folding and want to use the active lane mask
+  /// for vector loop control flow.
+  bool useActiveLaneMaskForControlFlow() const {
+    return FoldTailByMasking &&
+           TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+  }
+
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -1551,14 +1567,14 @@ public:
     Scalars.clear();
   }
 
-private:
-  unsigned NumPredStores = 0;
-
   /// Convenience function that returns the value of vscale_range iff
   /// vscale_range.min == vscale_range.max or otherwise returns the value
   /// returned by the corresponding TLI method.
   Optional<unsigned> getVScaleForTuning() const;
 
+private:
+  unsigned NumPredStores = 0;
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -1661,7 +1677,8 @@ private:
 
   /// A set containing all BasicBlocks that are known to present after
   /// vectorization as a predicated block.
-  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
+      PredicatedBBsAfterVectorization;
 
   /// Records whether it is allowed to have the original scalar loop execute at
   /// least once. This may be needed as a fallback loop in case runtime
@@ -1849,14 +1866,17 @@ class GeneratedRTChecks {
   DominatorTree *DT;
   LoopInfo *LI;
+  TargetTransformInfo *TTI;
 
   SCEVExpander SCEVExp;
   SCEVExpander MemCheckExp;
 
+  bool CostTooHigh = false;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
-                    const DataLayout &DL)
-      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
+                    TargetTransformInfo *TTI, const DataLayout &DL)
+      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
         MemCheckExp(SE, DL, "scev.check") {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
@@ -1867,6 +1887,15 @@ public:
   void Create(Loop *L, const LoopAccessInfo &LAI,
               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+    // Hard cutoff to limit compile-time increase in case a very large number of
+    // runtime checks needs to be generated.
+    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
+    // profile info.
+    CostTooHigh =
+        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+    if (CostTooHigh)
+      return;
+
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
@@ -1938,6 +1967,44 @@ public:
     }
   }
 
+  InstructionCost getCost() {
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
+
+    if (CostTooHigh) {
+      InstructionCost Cost;
+      Cost.setInvalid();
+      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
+      return Cost;
+    }
+
+    InstructionCost RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
+        RTCheckCost += C;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
+        RTCheckCost += C;
+      }
+
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
+                        << "\n");
+
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   // If tail is to be folded, vector loop takes care of all iterations.
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
-  Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+  auto CreateStep = [&]() {
+    // Create step with max(MinProTripCount, UF * VF).
+    if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
+      return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+    return createStepForVF(Builder, CountTy, VF, UF);
+  };
+
   if (!Cost->foldTailByMasking())
-    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+    CheckMinIters =
+        Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
   else if (VF.isScalable()) {
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
     // an overflow to zero when updating induction variables and so an
@@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
 
     // Don't execute the vector loop if (UMax - n) < (VF * UF).
-    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
+    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
+
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
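With the CreateStep() lambda above, the minimum-iterations guard compares the trip count against max(VF * UF, MinProfitableTripCount) rather than VF * UF alone. A worked example with made-up numbers: for VF = 4, UF = 2, and a MinProfitableTripCount of 16 derived from the runtime-check cost, the step is max(8, 16) = 16, so min.iters.check sends any loop with fewer than 16 iterations to the scalar loop, skipping both the vector body and its runtime checks.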
@@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
     MaxVScale =
         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
   MaxScalableVF = ElementCount::getScalable(
-      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+      MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
   if (!MaxScalableVF)
     reportVectorizationInfo(
         "Max legal vector width too small, scalable vectorization "
@@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable(
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
   if (Optional<unsigned> VScale = getVScaleForTuning()) {
     if (A.Width.isScalable())
-      EstimatedWidthA *= VScale.getValue();
+      EstimatedWidthA *= VScale.value();
     if (B.Width.isScalable())
-      EstimatedWidthB *= VScale.getValue();
+      EstimatedWidthB *= VScale.value();
   }
 
   // Assume vscale may be larger than 1 (or the value being tuned for),
@@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
 
   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
-  auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
+  const auto &TTICapture = TTI;
+  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
       return 0;
-    return TTI.getRegUsageForType(VectorType::get(Ty, VF));
+    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
   };
 
   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
   // map will indicate that we've analyzed it already.
   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
 
+  PredicatedBBsAfterVectorization[VF].clear();
+
   // Find all the instructions that are scalar with predication in the loop and
   // determine if it would be better to not if-convert the blocks they are in.
   // If so, we also record the instructions to scalarize.
@@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
           computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
         ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
       // Remember that BB will remain after vectorization.
-      PredicatedBBsAfterVectorization.insert(BB);
+      PredicatedBBsAfterVectorization[VF].insert(BB);
     }
   }
 }
@@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     bool ScalarPredicatedBB = false;
     BranchInst *BI = cast<BranchInst>(I);
     if (VF.isVector() && BI->isConditional() &&
-        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
-         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
+         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
       ScalarPredicatedBB = true;
 
     if (ScalarPredicatedBB) {
@@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  return (NumRuntimePointerChecks >
-              VectorizerParams::RuntimeMemoryCheckThreshold &&
-          !Hints.allowReordering()) ||
-         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-}
-
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
-  return CM.selectVectorizationFactor(VFCandidates);
+  VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
+         "when vectorizing, the scalar cost must be non-zero.");
+  return VF;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
   if (VectorizedLoopID)
-    L->setLoopID(VectorizedLoopID.getValue());
+    L->setLoopID(VectorizedLoopID.value());
   else {
     // Keep all loop hints from the original loop on the vector loop (we'll
     // replace the vectorizer-specific hints below).
@@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
 
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
-    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
-  // We create new control-flow for the vectorized loop, so the original exit
-  // conditions will be dead after vectorization if it's only used by the
-  // terminator
-  SmallVector<BasicBlock*> ExitingBlocks;
-  OrigLoop->getExitingBlocks(ExitingBlocks);
-  for (auto *BB : ExitingBlocks) {
-    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
-    if (!Cmp || !Cmp->hasOneUse())
-      continue;
-
-    // TODO: we should introduce a getUniqueExitingBlocks on Loop
-    if (!DeadInstructions.insert(Cmp).second)
-      continue;
-
-    // The operands of the icmp is often a dead trunc, used by IndUpdate.
-    // TODO: can recurse through operands in general
-    for (Value *Op : Cmp->operands()) {
-      if (isa<TruncInst>(Op) && Op->hasOneUse())
-        DeadInstructions.insert(cast<Instruction>(Op));
-    }
-  }
-
-  // We create new "steps" for induction variable updates to which the original
-  // induction variables map. An original update instruction will be dead if
-  // all its users except the induction variable are dead.
-  auto *Latch = OrigLoop->getLoopLatch();
-  for (auto &Induction : Legal->getInductionVars()) {
-    PHINode *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
-    // If the tail is to be folded by masking, the primary induction variable,
-    // if exists, isn't dead: it will be used for masking. Don't kill it.
-    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
-      continue;
-
-    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
-          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
-        }))
-      DeadInstructions.insert(IndUpdate);
-  }
-}
-
 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
 
 //===--------------------------------------------------------------------===//
@@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   if (!CM.blockNeedsPredicationForAnyReason(BB))
     return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
 
+  assert(CM.foldTailByMasking() && "must fold the tail");
+
+  // If we're using the active lane mask for control flow, then we get the
+  // mask from the active lane mask PHI that is cached in the VPlan.
+  PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
+  if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
+    return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+
   // Introduce the early-exit compare IV <= BTC to form header block mask.
   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
   // constructing the desired canonical IV in the header block as its first
   // non-phi instructions.
-  assert(CM.foldTailByMasking() && "must fold the tail");
+
   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
@@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
     VPBuilder::InsertPointGuard Guard(Builder);
     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-    if (CM.TTI.emitGetActiveLaneMask()) {
+    if (EmitGetActiveLaneMask != PredicationStyle::None) {
       VPValue *TC = Plan->getOrCreateTripCount();
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                       nullptr, "active.lane.mask");
     } else {
       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   return RegSucc;
 }
 
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
-                                                      VPRecipeBase *PredRecipe,
-                                                      VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
+    Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
   // Instructions marked for predication are replicated and placed under an
   // if-then construct to prevent side-effects.
@@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
   auto *PHIRecipe = Instr->getType()->isVoidTy()
                         ? nullptr
-                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
+                        : new VPPredInstPHIRecipe(PredRecipe);
   if (PHIRecipe) {
     Plan->removeVPValueFor(Instr);
     Plan->addVPValue(Instr, PHIRecipe);
@@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
-  // Collect instructions from the original loop that will become trivially dead
-  // in the vectorized loop. We don't need to vectorize these instructions. For
-  // example, original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-  collectTriviallyDeadInstructions(DeadInstructions);
-
   // Add assume instructions we need to drop to DeadInstructions, to prevent
   // them from being added to the VPlan.
   // TODO: We only need to drop assumes in blocks that get flattend. If the
   // control flow is preserved, we should keep them.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(),
                           ConditionalAssumes.end());
@@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   }
 }
 
-// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
-// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
-// BranchOnCount VPInstruction to the latch.
+// Add the necessary canonical IV and branch recipes required to control the
+// loop.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
-                                  bool HasNUW) {
+                                  bool HasNUW,
+                                  bool UseLaneMaskForLoopControlFlow) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddVPValue(StartIdx);
 
+  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
+  // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
+  // IV by VF * UF.
   auto *CanonicalIVIncrement =
       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
                                : VPInstruction::CanonicalIVIncrement,
-                        {CanonicalIVPHI}, DL);
+                        {CanonicalIVPHI}, DL, "index.next");
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   EB->appendRecipe(CanonicalIVIncrement);
 
-  auto *BranchOnCount =
-      new VPInstruction(VPInstruction::BranchOnCount,
-                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
-  EB->appendRecipe(BranchOnCount);
+  if (UseLaneMaskForLoopControlFlow) {
+    // Create the active lane mask instruction in the vplan preheader.
+    VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+
+    // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+    // we have to take unrolling into account. Each part needs to start at
+    //   Part * VF
+    auto *CanonicalIVIncrementParts =
+        new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+                                 : VPInstruction::CanonicalIVIncrementForPart,
+                          {StartV}, DL, "index.part.next");
+    Preheader->appendRecipe(CanonicalIVIncrementParts);
+
+    // Create the ActiveLaneMask instruction using the correct start values.
+    VPValue *TC = Plan.getOrCreateTripCount();
+    auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                       {CanonicalIVIncrementParts, TC}, DL,
+                                       "active.lane.mask.entry");
+    Preheader->appendRecipe(EntryALM);
+
+    // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+    // preheader ActiveLaneMask instruction.
+    auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+    Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
+
+    // Create the active lane mask for the next iteration of the loop.
+    CanonicalIVIncrementParts =
+        new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+                                 : VPInstruction::CanonicalIVIncrementForPart,
+                          {CanonicalIVIncrement}, DL);
+    EB->appendRecipe(CanonicalIVIncrementParts);
+
+    auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                  {CanonicalIVIncrementParts, TC}, DL,
+                                  "active.lane.mask.next");
+    EB->appendRecipe(ALM);
+    LaneMaskPhi->addOperand(ALM);
+
+    // We have to invert the mask here because a true condition means jumping
+    // to the exit block.
+    auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
+    EB->appendRecipe(NotMask);
+
+    VPInstruction *BranchBack =
+        new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
+    EB->appendRecipe(BranchBack);
+  } else {
+    // Add the BranchOnCount VPInstruction to the latch.
+    VPInstruction *BranchBack = new VPInstruction(
+        VPInstruction::BranchOnCount,
+        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+    EB->appendRecipe(BranchBack);
+  }
 }
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
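For orientation, when UseLaneMaskForLoopControlFlow is set, the recipes created above give the vector loop roughly this shape (a sketch assembled from the recipe and value names in this hunk, not actual vectorizer output):

    vector.ph:
      index.part.next        = CanonicalIVIncrementForPart 0
      active.lane.mask.entry = ActiveLaneMask index.part.next, trip.count
    vector.body:
      active.lane.mask = PHI active.lane.mask.entry, active.lane.mask.next
      ...
      index.next            = CanonicalIVIncrement index
      active.lane.mask.next = ActiveLaneMask (CanonicalIVIncrementForPart index.next), trip.count
      BranchOnCond (Not active.lane.mask.next)

The mask is negated because BranchOnCond exits on a true condition, while the lane mask stays true for as long as any lane remains active.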
@@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
-                        !CM.foldTailByMasking());
+                        !CM.foldTailByMasking(),
+                        CM.useActiveLaneMaskForControlFlow());
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
   VPlanTransforms::sinkScalarOperands(*Plan);
-  VPlanTransforms::mergeReplicateRegions(*Plan);
   VPlanTransforms::removeDeadRecipes(*Plan);
+  VPlanTransforms::mergeReplicateRegions(*Plan);
   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
 
   // Fold Exit block into its predecessor if possible.
@@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
     Term->eraseFromParent();
 
     addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
-                          true);
+                          true, CM.useActiveLaneMaskForControlFlow());
     return Plan;
   }
 
@@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       Plan->removeVPValueFor(R);
       Plan->addVPValue(R, RedRecipe);
-      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+      // Append the recipe to the end of the VPBasicBlock because we need to
+      // ensure that it comes after all of it's inputs, including CondOp.
+      WidenRecipe->getParent()->appendRecipe(RedRecipe);
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       WidenRecipe->eraseFromParent();
 
@@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
                                   *this, State);
 }
 
-void VPWidenSelectRecipe::execute(VPTransformState &State) {
-  auto &I = *cast<SelectInst>(getUnderlyingInstr());
-  State.setDebugLocFromInst(&I);
-
-  // The condition can be loop invariant but still defined inside the
-  // loop. This means that we can't just use the original 'cond' value.
-  // We have to take the 'vectorized' value and pick the first lane.
-  // Instcombine will make this a no-op.
-  auto *InvarCond =
-      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
-
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
-    Value *Op0 = State.get(getOperand(1), Part);
-    Value *Op1 = State.get(getOperand(2), Part);
-    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
-    State.set(this, Sel, Part);
-    State.addMetadata(Sel, &I);
-  }
-}
-
-void VPWidenRecipe::execute(VPTransformState &State) {
-  auto &I = *cast<Instruction>(getUnderlyingValue());
-  auto &Builder = State.Builder;
-  switch (I.getOpcode()) {
-  case Instruction::Call:
-  case Instruction::Br:
-  case Instruction::PHI:
-  case Instruction::GetElementPtr:
-  case Instruction::Select:
-    llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::SRem:
-  case Instruction::URem:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::FNeg:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::FDiv:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Just widen unops and binops.
-    State.setDebugLocFromInst(&I);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      SmallVector<Value *, 2> Ops;
-      for (VPValue *VPOp : operands())
-        Ops.push_back(State.get(VPOp, Part));
-
-      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
-      if (auto *VecOp = dyn_cast<Instruction>(V)) {
-        VecOp->copyIRFlags(&I);
-
-        // If the instruction is vectorized and was in a basic block that needed
-        // predication, we can't propagate poison-generating flags (nuw/nsw,
-        // exact, etc.). The control flow has been linearized and the
-        // instruction is no longer guarded by the predicate, which could make
-        // the flag properties to no longer hold.
-        if (State.MayGeneratePoisonRecipes.contains(this))
-          VecOp->dropPoisonGeneratingFlags();
-      }
-
-      // Use this vector value for all users of the original instruction.
-      State.set(this, V, Part);
-      State.addMetadata(V, &I);
-    }
-
-    break;
-  }
-  case Instruction::Freeze: {
-    State.setDebugLocFromInst(&I);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *Op = State.get(getOperand(0), Part);
-
-      Value *Freeze = Builder.CreateFreeze(Op);
-      State.set(this, Freeze, Part);
-    }
-    break;
-  }
-  case Instruction::ICmp:
-  case Instruction::FCmp: {
-    // Widen compares. Generate vector compares.
-    bool FCmp = (I.getOpcode() == Instruction::FCmp);
-    auto *Cmp = cast<CmpInst>(&I);
-    State.setDebugLocFromInst(Cmp);
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *A = State.get(getOperand(0), Part);
-      Value *B = State.get(getOperand(1), Part);
-      Value *C = nullptr;
-      if (FCmp) {
-        // Propagate fast math flags.
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-        Builder.setFastMathFlags(Cmp->getFastMathFlags());
-        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-      } else {
-        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
-      }
-      State.set(this, C, Part);
-      State.addMetadata(C, &I);
-    }
-
-    break;
-  }
-
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    auto *CI = cast<CastInst>(&I);
-    State.setDebugLocFromInst(CI);
-
-    /// Vectorize casts.
-    Type *DestTy = (State.VF.isScalar())
-                       ? CI->getType()
-                       : VectorType::get(CI->getType(), State.VF);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *A = State.get(getOperand(0), Part);
-      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
-      State.set(this, Cast, Part);
-      State.addMetadata(Cast, &I);
-    }
-    break;
-  }
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
-}
-
-void VPWidenGEPRecipe::execute(VPTransformState &State) {
-  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
-  // Construct a vector GEP by widening the operands of the scalar GEP as
-  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-  // results in a vector of pointers when at least one operand of the GEP
-  // is vector-typed. Thus, to keep the representation compact, we only use
-  // vector-typed operands for loop-varying values.
-
-  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
-    // If we are vectorizing, but the GEP has only loop-invariant operands,
-    // the GEP we build (by only using vector-typed operands for
-    // loop-varying values) would be a scalar pointer. Thus, to ensure we
-    // produce a vector of pointers, we need to either arbitrarily pick an
-    // operand to broadcast, or broadcast a clone of the original GEP.
-    // Here, we broadcast a clone of the original.
-    //
-    // TODO: If at some point we decide to scalarize instructions having
-    //       loop-invariant operands, this special case will no longer be
-    //       required. We would add the scalarization decision to
-    //       collectLoopScalars() and teach getVectorValue() to broadcast
-    //       the lane-zero scalar value.
-    auto *Clone = State.Builder.Insert(GEP->clone());
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
-      State.set(this, EntryPart, Part);
-      State.addMetadata(EntryPart, GEP);
-    }
-  } else {
-    // If the GEP has at least one loop-varying operand, we are sure to
-    // produce a vector of pointers. But if we are only unrolling, we want
-    // to produce a scalar GEP for each unroll part. Thus, the GEP we
-    // produce with the code below will be scalar (if VF == 1) or vector
-    // (otherwise). Note that for the unroll-only case, we still maintain
-    // values in the vector mapping with initVector, as we do for other
-    // instructions.
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // The pointer operand of the new GEP. If it's loop-invariant, we
-      // won't broadcast it.
-      auto *Ptr = IsPtrLoopInvariant
-                      ? State.get(getOperand(0), VPIteration(0, 0))
-                      : State.get(getOperand(0), Part);
-
-      // Collect all the indices for the new GEP. If any index is
-      // loop-invariant, we won't broadcast it.
-      SmallVector<Value *, 4> Indices;
-      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
-        VPValue *Operand = getOperand(I);
-        if (IsIndexLoopInvariant[I - 1])
-          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
-        else
-          Indices.push_back(State.get(Operand, Part));
-      }
-
-      // If the GEP instruction is vectorized and was in a basic block that
-      // needed predication, we can't propagate the poison-generating 'inbounds'
-      // flag. The control flow has been linearized and the GEP is no longer
-      // guarded by the predicate, which could make the 'inbounds' properties to
-      // no longer hold.
-      bool IsInBounds =
-          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
-
-      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-      // but it should be a vector, otherwise.
-      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
-                                             Indices, "", IsInBounds);
-      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
-             "NewGEP is not a pointer vector");
-      State.set(this, NewGEP, Part);
-      State.addMetadata(NewGEP, GEP);
-    }
-  }
-}
-
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
@@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-void VPBlendRecipe::execute(VPTransformState &State) {
-  State.setDebugLocFromInst(Phi);
-  // We know that all PHIs in non-header blocks are converted into
-  // selects, so we don't have to worry about the insertion order and we
-  // can just use the builder.
-  // At this point we generate the predication tree. There may be
-  // duplications since this is a simple recursive scan, but future
-  // optimizations will clean it up.
-
-  unsigned NumIncoming = getNumIncomingValues();
-
-  // Generate a sequence of selects of the form:
-  // SELECT(Mask3, In3,
-  //        SELECT(Mask2, In2,
-  //               SELECT(Mask1, In1,
-  //                      In0)))
-  // Note that Mask0 is never used: lanes for which no path reaches this phi and
-  // are essentially undef are taken from In0.
-  InnerLoopVectorizer::VectorParts Entry(State.UF);
-  for (unsigned In = 0; In < NumIncoming; ++In) {
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // We might have single edge PHIs (blocks) - use an identity
-      // 'select' for the first PHI operand.
-      Value *In0 = State.get(getIncomingValue(In), Part);
-      if (In == 0)
-        Entry[Part] = In0; // Initialize with the first incoming value.
-      else {
-        // Select between the current value and the previous incoming edge
-        // based on the incoming mask.
-        Value *Cond = State.get(getMask(In), Part);
-        Entry[Part] =
-            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
-      }
-    }
-  }
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    State.set(this, Entry[Part], Part);
-}
-
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
@@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
                                     State);
 }
 
-void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
-  assert(State.Instance && "Branch on Mask works only on single instance.");
-
-  unsigned Part = State.Instance->Part;
-  unsigned Lane = State.Instance->Lane.getKnownLane();
-
-  Value *ConditionBit = nullptr;
-  VPValue *BlockInMask = getMask();
-  if (BlockInMask) {
-    ConditionBit = State.get(BlockInMask, Part);
-    if (ConditionBit->getType()->isVectorTy())
-      ConditionBit = State.Builder.CreateExtractElement(
-          ConditionBit, State.Builder.getInt32(Lane));
-  } else // Block in mask is all-one.
-    ConditionBit = State.Builder.getTrue();
-
-  // Replace the temporary unreachable terminator with a new conditional branch,
-  // whose two destinations will be set later when they are created.
-  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert(isa<UnreachableInst>(CurrentTerminator) &&
-         "Expected to replace unreachable terminator with conditional branch.");
-  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
-  CondBr->setSuccessor(0, nullptr);
-  ReplaceInstWithInst(CurrentTerminator, CondBr);
-}
-
 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Predicated instruction PHI works per instance.");
   Instruction *ScalarPredInst =
@@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath(
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath(
     VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
 
     {
-      GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+      GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                                F->getParent()->getDataLayout());
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                             &CM, BFI, PSI, Checks);
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                             VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
       LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                         << L->getHeader()->getParent()->getName() << "\"\n");
       LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
+static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
+                                       VectorizationFactor &VF,
+                                       Optional<unsigned> VScale, Loop *L,
+                                       ScalarEvolution &SE) {
+  InstructionCost CheckCost = Checks.getCost();
+  if (!CheckCost.isValid())
+    return false;
+
+  // When interleaving only scalar and vector cost will be equal, which in turn
+  // would lead to a divide by 0. Fall back to hard threshold.
+  if (VF.Width.isScalar()) {
+    if (CheckCost > VectorizeMemoryCheckThreshold) {
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Interleaving only is not profitable due to runtime checks\n");
+      return false;
+    }
+    return true;
+  }
+
+  // The scalar cost should only be 0 when vectorizing with a user specified
+  // VF/IC. In those cases, runtime checks should always be generated.
+  double ScalarC = *VF.ScalarCost.getValue();
+  if (ScalarC == 0)
+    return true;
+
+  // First, compute the minimum iteration count required so that the vector
+  // loop outperforms the scalar loop.
+  //  The total cost of the scalar loop is
+  //   ScalarC * TC
+  //  where
+  //  * TC is the actual trip count of the loop.
+  //  * ScalarC is the cost of a single scalar iteration.
+  //
+  //  The total cost of the vector loop is
+  //    RtC + VecC * (TC / VF) + EpiC
+  //  where
+  //  * RtC is the cost of the generated runtime checks
+  //  * VecC is the cost of a single vector iteration.
+  //  * TC is the actual trip count of the loop
+  //  * VF is the vectorization factor
+  //  * EpiCost is the cost of the generated epilogue, including the cost
+  //    of the remaining scalar operations.
+  //
+  // Vectorization is profitable once the total vector cost is less than the
+  // total scalar cost:
+  //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
+  //
+  // Now we can compute the minimum required trip count TC as
+  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //
+  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
+  // the computations are performed on doubles, not integers and the result
+  // is rounded up, hence we get an upper estimate of the TC.
+  unsigned IntVF = VF.Width.getKnownMinValue();
+  if (VF.Width.isScalable()) {
+    unsigned AssumedMinimumVscale = 1;
+    if (VScale)
+      AssumedMinimumVscale = *VScale;
+    IntVF *= AssumedMinimumVscale;
+  }
+  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
+  double RtC = *CheckCost.getValue();
+  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+
+  // Second, compute a minimum iteration count so that the cost of the
+  // runtime checks is only a fraction of the total scalar loop cost. This
+  // adds a loop-dependent bound on the overhead incurred if the runtime
+  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
+  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
+  // cost, compute
+  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
+  double MinTC2 = RtC * 10 / ScalarC;
+
+  // Now pick the larger minimum. If it is not a multiple of VF, choose the
+  // next closest multiple of VF. This should partly compensate for ignoring
+  // the epilogue cost.
+  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+  VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");
+
+  // Skip vectorization if the expected trip count is less than the minimum
+  // required trip count.
+  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
+    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
+                                VF.MinProfitableTripCount)) {
+      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                            "trip count < minimum profitable VF ("
                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
                         << ")\n");
+
+      return false;
+    }
+  }
+  return true;
+}
+
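A worked example for the two bounds, with made-up costs: take ScalarC = 4, VecC = 10, VF = 4 (so VecC / VF = 2.5) and RtC = 30. Then MinTC1 = 30 / (4 - 2.5) = 20, and with the fraction X = 10 used above, MinTC2 = 30 * 10 / 4 = 75. The larger bound wins and is rounded up to the next multiple of VF, giving MinProfitableTripCount = 76; a loop whose expected trip count is below that is left scalar rather than paying for the checks.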
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                                !EnableLoopInterleaving),
@@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   CM.collectElementTypesForWidening();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                            F->getParent()->getDataLayout());
   if (MaybeVF) {
-    if (LVP.requiresTooManyRuntimeChecks()) {
+    VF = *MaybeVF;
+    // Select the interleave count.
+    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+
+    unsigned SelectedIC = std::max(IC, UserIC);
+    //  Optimistically generate runtime checks if they are needed. Drop them if
+    //  they turn out to not be profitable.
+    if (VF.Width.isVector() || SelectedIC > 1)
+      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+
+    // Check if it is profitable to vectorize with runtime checks.
+    bool ForceVectorization =
+        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+    if (!ForceVectorization &&
+        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+                                    *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
             DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
@@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       Hints.emitRemarkWithHints();
       return false;
     }
-    VF = *MaybeVF;
-    // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
-
-    unsigned SelectedIC = std::max(IC, UserIC);
-    // Optimistically generate runtime checks if they are needed. Drop them if
-    // they turn out to not be profitable.
-    if (VF.Width.isVector() || SelectedIC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       if (!MainILV.areSafetyChecksAdded())
         DisableRuntimeUnroll = true;
     } else {
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                             &LVL, &CM, BFI, PSI, Checks);
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                             VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
+                             PSI, Checks);
       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
       LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
@@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                         LLVMLoopVectorizeFollowupEpilogue});
     if (RemainderLoopID) {
-      L->setLoopID(RemainderLoopID.getValue());
+      L->setLoopID(RemainderLoopID.value());
     } else {
       if (DisableRuntimeUnroll)
         AddRuntimeUnrollDisableMetaData(L);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 019a09665a67..e136cd9aedac 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2637,7 +2637,7 @@ private:
     AliasCacheKey key = std::make_pair(Inst1, Inst2);
     Optional<bool> &result = AliasCache[key];
     if (result) {
-      return result.getValue();
+      return result.value();
     }
     bool aliased = true;
     if (Loc1.Ptr && isSimple(Inst1))
@@ -4592,7 +4592,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   };
 
   InstructionsState S = getSameOpcode(VL);
-  if (Depth == RecursionMaxDepth) {
+
+  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
+  // a load), in which case peek through to include it in the tree, without
+  // ballooning over-budget.
+  if (Depth >= RecursionMaxDepth &&
+      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
+        VL.size() >= 4 &&
+        (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
+           return match(I,
+                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
+                  cast<Instruction>(I)->getOpcode() ==
+                      cast<Instruction>(S.MainOp)->getOpcode();
+         })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
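The new condition peeks through exactly one extra level at the depth limit: either the bundle itself consists of loads, or every element is a single-use zext/sext of a single-use load and all elements share one opcode. For example, a hypothetical bundle VL = { zext(load p[0]), zext(load p[1]), zext(load p[2]), zext(load p[3]) } with all values single-use is still turned into a tree entry instead of being gathered, because the loads underneath terminate the recursion immediately and cannot grow the tree past its budget.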
+ auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else + Builder.SetCurrentDebugLocation(DIL); } BasicBlock * @@ -566,6 +568,24 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa<VPActiveLaneMaskPHIRecipe>(&R)) + return cast<VPActiveLaneMaskPHIRecipe>(&R); + } + return nullptr; +} + +static bool canSimplifyBranchOnCond(VPInstruction *Term) { + VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0)); + if (!Not || Not->getOpcode() != VPInstruction::Not) + return false; + + VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0)); + return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask; +} + void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, @@ -573,11 +593,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); - // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when - // preparing to execute the plan for the main vector loop. - if (!IsEpilogueVectorization && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && - isa<ConstantInt>(TripCountV)) { + // Try to simplify the branch condition if TC <= VF * UF when preparing to + // execute the plan for the main vector loop. We only do this if the + // terminator is: + // 1. BranchOnCount, or + // 2. BranchOnCond where the input is Not(ActiveLaneMask). + if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) && + (Term->getOpcode() == VPInstruction::BranchOnCount || + (Term->getOpcode() == VPInstruction::BranchOnCond && + canSimplifyBranchOnCond(Term)))) { ConstantInt *C = cast<ConstantInt>(TripCountV); uint64_t TCVal = C->getZExtValue(); if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { @@ -697,7 +721,8 @@ void VPlan::execute(VPTransformState *State) { // generated. bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || - cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 09da4a545d0d..f009a7ee6b4b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -784,6 +784,10 @@ public: ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, BranchOnCond }; @@ -794,6 +798,9 @@ private: FastMathFlags FMF; DebugLoc DL; + /// An optional name that can be used for the generated IR instruction. 
+ const std::string Name; + /// Utility method serving execute(): generates a single instance of the /// modeled instruction. void generateInstruction(VPTransformState &State, unsigned Part); @@ -802,14 +809,15 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), - DL(DL) {} + DL(DL), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +826,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL); + return new VPInstruction(Opcode, Operands, DL, Name); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -897,6 +905,8 @@ public: case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: return true; }; @@ -1125,6 +1135,7 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1132,6 +1143,7 @@ public: } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1861,6 +1873,42 @@ public: } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +/// remove VPActiveLaneMaskPHIRecipe. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. 
+ void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2656,6 +2704,10 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there + should be at most one. If there isn't one, then return nullptr. + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void clearLiveOuts() { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 92422b17457c..fdd901a4a70d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -26,13 +26,19 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> using namespace llvm; +using VectorParts = SmallVector<Value *, 2>; + extern cl::opt<bool> EnableVPlanNativePath; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { @@ -186,7 +192,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + Value *V = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); State.set(this, V, Part); return; } @@ -194,14 +201,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, switch (getOpcode()) { case VPInstruction::Not: { Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); + Value *V = Builder.CreateNot(A, Name); State.set(this, V, Part); break; } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); + Value *V = Builder.CreateICmpULE(IV, TC, Name); State.set(this, V, Part); break; } @@ -209,7 +216,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); + Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); State.set(this, V, Part); break; } @@ -223,7 +230,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + {VIVElem0, ScalarTC}, nullptr, Name); State.set(this, Call, Part); break; } @@ -247,7 +254,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, PartMinus1, Part); } else { Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1,
V2, -1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), + Part); } break; } @@ -261,7 +269,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, // elements) times the unroll factor (num of SIMD instructions). Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); } @@ -269,6 +277,23 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, Next, Part); break; } + + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + if (Part == 0) { + State.set(this, IV, Part); + break; + } + + // The canonical IV is incremented by the vectorization factor (num of SIMD + // elements) times the unroll part. + Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); + State.set(this, Next, Part); + break; + } case VPInstruction::BranchOnCond: { if (Part != 0) break; @@ -370,6 +395,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; @@ -431,7 +462,158 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(2)->printAsOperand(O, SlotTracker); O << (InvariantCond ? " (condition is loop invariant)" : ""); } +#endif + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.addMetadata(Sel, &I); + } +} + +void VPWidenRecipe::execute(VPTransformState &State) { + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.contains(this)) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.addMetadata(V, &I); + } + + break; + } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.addMetadata(C, &I); + } + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; @@ -487,7 +669,82 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "= SCALAR-STEPS "; printOperands(O, SlotTracker); } +#endif + +void VPWidenGEPRecipe::execute(VPTransformState &State) { + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. 
+ SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.addMetadata(NewGEP, GEP); + } + } +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -501,7 +758,48 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, O << " = getelementptr "; printOperands(O, SlotTracker); } +#endif +void VPBlendRecipe::execute(VPTransformState &State) { + State.setDebugLocFromInst(Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. + VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; ++In) { + for (unsigned Part = 0; Part < State.UF; ++Part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = State.get(getIncomingValue(In), Part); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. 
+ Value *Cond = State.get(getMask(In), Part); + Entry[Part] = + State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); + } + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.set(this, Entry[Part], Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; @@ -566,7 +864,35 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, if (AlsoPack) O << " (S->V)"; } +#endif +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane.getKnownLane(); + + Value *ConditionBit = nullptr; + VPValue *BlockInMask = getMask(); + if (BlockInMask) { + ConditionBit = State.get(BlockInMask, Part); + if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. + auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; @@ -838,3 +1164,28 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +// remove VPActiveLaneMaskPHIRecipe. +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "ACTIVE-LANE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5fc676834331..c99fae1b2ab4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ public: // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ public: // Phi-like recipes. Need to be kept together. 
VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index f917883145c0..3501de6ab38e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -133,32 +133,48 @@ void VPlanVerifier::verifyHierarchicalCFG( verifyRegionRec(TopRegion); } -bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); - for (const VPBasicBlock *VPBB : - VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { - // Verify that phi-like recipes are at the beginning of the block, with no - // other recipes in between. - auto RecipeI = VPBB->begin(); - auto End = VPBB->end(); - while (RecipeI != End && RecipeI->isPhi()) - RecipeI++; +static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) { + // Verify that phi-like recipes are at the beginning of the block, with no + // other recipes in between. + auto RecipeI = VPBB->begin(); + auto End = VPBB->end(); + unsigned NumActiveLaneMaskPhiRecipes = 0; + while (RecipeI != End && RecipeI->isPhi()) { + if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI)) + NumActiveLaneMaskPhiRecipes++; + RecipeI++; + } - while (RecipeI != End) { - if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { - errs() << "Found phi-like recipe after non-phi recipe"; + if (NumActiveLaneMaskPhiRecipes > 1) { + errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe"; + return false; + } + + while (RecipeI != End) { + if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { + errs() << "Found phi-like recipe after non-phi recipe"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - errs() << ": "; - RecipeI->dump(); - errs() << "after\n"; - std::prev(RecipeI)->dump(); + errs() << ": "; + RecipeI->dump(); + errs() << "after\n"; + std::prev(RecipeI)->dump(); #endif - return false; - } - RecipeI++; + return false; } + RecipeI++; + } + + return true; +} + +bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); + for (const VPBasicBlock *VPBB : + VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { + if (!verifyVPBasicBlock(VPBB)) + return false; } const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); @@ -181,15 +197,16 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount " - "VPInstruction but is empty\n"; + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; return false; } auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp 
b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 90598937affc..d12624ffb824 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -414,6 +414,10 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilder<> &Builder) { + // Shufflevectors can only be created for fixed-width vectors. + if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) + return nullptr; + // If the extract can be constant-folded, this code is unsimplified. Defer // to other passes to handle that. Value *X = ExtElt->getVectorOperand(); @@ -1249,14 +1253,20 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || VT != Op0->getType()) return false; - auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0)); - auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1)); - auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0)); - auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1)); + auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); + auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); + auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); + auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); + SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); auto checkSVNonOpUses = [&](Instruction *I) { if (!I || I->getOperand(0)->getType() != VT) return true; - return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + return any_of(I->users(), [&](User *U) { + return U != Op0 && U != Op1 && + !(isa<ShuffleVectorInst>(U) && + (InputShuffles.contains(cast<Instruction>(U)) || + isInstructionTriviallyDead(cast<Instruction>(U)))); + }); }; if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) @@ -1271,6 +1281,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SV = dyn_cast<ShuffleVectorInst>(U); if (!SV || SV->getType() != VT) return false; + if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || + (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) + return false; if (!llvm::is_contained(Shuffles, SV)) Shuffles.push_back(SV); } @@ -1283,13 +1296,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (FromReduction && Shuffles.size() > 1) return false; + // Add any shuffle uses for the shuffles we have found, to include them in our + // cost calculations. + if (!FromReduction) { + for (ShuffleVectorInst *SV : Shuffles) { + for (auto U : SV->users()) { + ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); + if (SSV && isa<UndefValue>(SSV->getOperand(1))) + Shuffles.push_back(SSV); + } + } + } + // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the // end. If the binops are legalized to smaller vectors, this may reduce total // number of binops. We compute the ReconstructMask mask needed to convert // back to the original lane order. 
- SmallVector<int> V1, V2; - SmallVector<SmallVector<int>> ReconstructMasks; + SmallVector<std::pair<int, int>> V1, V2; + SmallVector<SmallVector<int>> OrigReconstructMasks; int MaxV1Elt = 0, MaxV2Elt = 0; unsigned NumElts = VT->getNumElements(); for (ShuffleVectorInst *SVN : Shuffles) { @@ -1300,6 +1325,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // case we need to commute the mask). Value *SVOp0 = SVN->getOperand(0); Value *SVOp1 = SVN->getOperand(1); + if (isa<UndefValue>(SVOp1)) { + auto *SSV = cast<ShuffleVectorInst>(SVOp0); + SVOp0 = SSV->getOperand(0); + SVOp1 = SSV->getOperand(1); + for (unsigned I = 0, E = Mask.size(); I != E; I++) { + if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) + return false; + Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); + } + } if (SVOp0 == Op1 && SVOp1 == Op0) { std::swap(SVOp0, SVOp1); ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); @@ -1316,21 +1351,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { ReconstructMask.push_back(-1); } else if (Mask[I] < static_cast<int>(NumElts)) { MaxV1Elt = std::max(MaxV1Elt, Mask[I]); - auto It = find(V1, Mask[I]); + auto It = find_if(V1, [&](const std::pair<int, int> &A) { + return Mask[I] == A.first; + }); if (It != V1.end()) ReconstructMask.push_back(It - V1.begin()); else { ReconstructMask.push_back(V1.size()); - V1.push_back(Mask[I]); + V1.emplace_back(Mask[I], V1.size()); } } else { MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); - auto It = find(V2, Mask[I] - NumElts); + auto It = find_if(V2, [&](const std::pair<int, int> &A) { + return Mask[I] - static_cast<int>(NumElts) == A.first; + }); if (It != V2.end()) ReconstructMask.push_back(NumElts + It - V2.begin()); else { ReconstructMask.push_back(NumElts + V2.size()); - V2.push_back(Mask[I] - NumElts); + V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); } } } @@ -1339,7 +1378,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // result. In-order can help simplify the shuffle away. if (FromReduction) sort(ReconstructMask); - ReconstructMasks.push_back(ReconstructMask); + OrigReconstructMasks.push_back(std::move(ReconstructMask)); } // If the maximum elements used from V1 and V2 are not larger than the new @@ -1351,16 +1390,68 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { MaxV2Elt == static_cast<int>(V2.size()) - 1)) return false; + // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a + // shuffle of another shuffle, or not a shuffle (that is treated like an + // identity shuffle). + auto GetBaseMaskValue = [&](Instruction *I, int M) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return M; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getMaskValue(SV->getMaskValue(M)); + return SV->getMaskValue(M); + }; + + // Attempt to sort the inputs by ascending mask values to make simpler input + // shuffles and push complex shuffles down to the uses. We sort on the first + // of the two input shuffle orders, to try and get at least one input into a + // nice order.
+ auto SortBase = [&](Instruction *A, std::pair<int, int> X, + std::pair<int, int> Y) { + int MXA = GetBaseMaskValue(A, X.first); + int MYA = GetBaseMaskValue(A, Y.first); + return MXA < MYA; + }; + stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI0A, A, B); + }); + stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI1A, A, B); + }); + // Calculate our ReconstructMasks from the OrigReconstructMasks and the + // modified order of the input shuffles. + SmallVector<SmallVector<int>> ReconstructMasks; + for (auto Mask : OrigReconstructMasks) { + SmallVector<int> ReconstructMask; + for (int M : Mask) { + auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { + auto It = find_if(V, [M](auto A) { return A.second == M; }); + assert(It != V.end() && "Expected all entries in Mask"); + return std::distance(V.begin(), It); + }; + if (M < 0) + ReconstructMask.push_back(-1); + else if (M < static_cast<int>(NumElts)) { + ReconstructMask.push_back(FindIndex(V1, M)); + } else { + ReconstructMask.push_back(NumElts + FindIndex(V2, M)); + } + } + ReconstructMasks.push_back(std::move(ReconstructMask)); + } + // Calculate the masks needed for the new input shuffles, which get padded // with undef SmallVector<int> V1A, V1B, V2A, V2B; for (unsigned I = 0; I < V1.size(); I++) { - V1A.push_back(SVI0A->getMaskValue(V1[I])); - V1B.push_back(SVI0B->getMaskValue(V1[I])); + V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); + V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); } for (unsigned I = 0; I < V2.size(); I++) { - V2A.push_back(SVI1A->getMaskValue(V2[I])); - V2B.push_back(SVI1B->getMaskValue(V2[I])); + V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); + V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); } while (V1A.size() < NumElts) { V1A.push_back(UndefMaskElem); @@ -1371,9 +1462,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(UndefMaskElem); } - auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { - return C + - TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) + ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + VT, SV->getShuffleMask()); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); @@ -1386,9 +1482,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - // This set helps us only cost each unique shuffle once. 
- SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles( - {SVI0A, SVI0B, SVI1A, SVI1B}); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), InstructionCost(0), AddShuffleCost); @@ -1408,22 +1501,35 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), InstructionCost(0), AddShuffleMaskCost); + LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); + LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore + << " vs CostAfter: " << CostAfter << "\n"); if (CostBefore <= CostAfter) return false; // The cost model has passed, create the new instructions. - Builder.SetInsertPoint(SVI0A); - Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), - SVI0A->getOperand(1), V1A); - Builder.SetInsertPoint(SVI0B); - Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), - SVI0B->getOperand(1), V1B); - Builder.SetInsertPoint(SVI1A); - Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), - SVI1A->getOperand(1), V2A); - Builder.SetInsertPoint(SVI1B); - Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), - SVI1B->getOperand(1), V2B); + auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return I; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getOperand(Op); + return SV->getOperand(Op); + }; + Builder.SetInsertPoint(SVI0A->getNextNode()); + Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), + GetShuffleOperand(SVI0A, 1), V1A); + Builder.SetInsertPoint(SVI0B->getNextNode()); + Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), + GetShuffleOperand(SVI0B, 1), V1B); + Builder.SetInsertPoint(SVI1A->getNextNode()); + Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), + GetShuffleOperand(SVI1A, 1), V2A); + Builder.SetInsertPoint(SVI1B->getNextNode()); + Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), + GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), NSV0A, NSV0B);
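For readers following the active-lane-mask changes above (the ActiveLaneMask VPInstruction and the new VPActiveLaneMaskPHIRecipe): lane I of llvm.get.active.lane.mask(Base, TC) is set while Base + I is below the trip count, which is what lets the final vector iteration run predicated instead of requiring a scalar epilogue. A minimal scalar sketch of that semantics, assuming a fixed VF; activeLaneMask is an illustrative helper, not an LLVM API.

#include <cstdio>
#include <vector>

// Lane I of get.active.lane.mask(Base, TC) is true while Base + I < TC.
std::vector<bool> activeLaneMask(unsigned Base, unsigned TC, unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = Base + I < TC;
  return Mask;
}

int main() {
  // TC=6, VF=4: the second vector iteration runs with mask <1,1,0,0>, so the
  // two tail elements are masked off rather than handled by a scalar loop.
  for (bool B : activeLaneMask(/*Base=*/4, /*TC=*/6, /*VF=*/4))
    std::printf("%d", B); // prints 1100
  std::printf("\n");
}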
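The prepareToExecute hunk above generalizes an existing fold: with a constant trip count no larger than VF * UF, both BranchOnCount and a BranchOnCond fed by Not(ActiveLaneMask) always exit after the first vector iteration, so the terminator can become 'BranchOnCond true'. A sketch of just that guard; canFoldLatchBranch is hypothetical, and a scalable VF would use its known minimum value here.

#include <cstdint>
#include <cstdio>

// With a compile-time trip count TC, the latch branch always exits after the
// first vector iteration whenever 0 < TC <= VF * UF.
bool canFoldLatchBranch(uint64_t TC, unsigned VF, unsigned UF) {
  return TC != 0 && TC <= static_cast<uint64_t>(VF) * UF;
}

int main() {
  std::printf("%d\n", canFoldLatchBranch(7, 4, 2)); // 1: 7 <= 4*2, one iteration
  std::printf("%d\n", canFoldLatchBranch(9, 4, 2)); // 0: a second iteration is needed
}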
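CanonicalIVIncrementForPart, added above, materializes a separate IV value per unrolled part rather than one shared increment: part 0 sees the unincremented IV and part K sees it offset by K vector widths (the patch uses createStepForVF, which also handles scalable factors; the sketch below assumes a fixed VF, and ivPerPart is illustrative only).

#include <cstdio>
#include <vector>

// The IV value each unrolled part sees: part K is offset by K vector widths.
std::vector<unsigned> ivPerPart(unsigned IV, unsigned VF, unsigned UF) {
  std::vector<unsigned> Parts(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Parts[Part] = IV + VF * Part; // Part 0 is the IV itself.
  return Parts;
}

int main() {
  // VF=4, UF=2: part 0 starts at the IV, part 1 one vector width later.
  for (unsigned V : ivPerPart(/*IV=*/8, /*VF=*/4, /*UF=*/2))
    std::printf("%u ", V); // prints: 8 12
  std::printf("\n");
}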
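VPBlendRecipe::execute, moved into VPlanRecipes.cpp above, emits exactly the nested select chain its comment describes. A single-lane scalar model of that chain, with blendLane as a hypothetical stand-in; note that Mask0 is never consulted, matching the recipe.

#include <cassert>
#include <cstdio>
#include <vector>

// Folds the incoming values of one blended phi for a single lane:
//   SELECT(Mask3, In3, SELECT(Mask2, In2, SELECT(Mask1, In1, In0)))
// A lane reached by no masked edge keeps In0.
int blendLane(const std::vector<int> &Incoming, const std::vector<bool> &Masks) {
  assert(Incoming.size() == Masks.size() && !Incoming.empty());
  int Entry = Incoming[0]; // Initialize with the first incoming value.
  for (size_t In = 1; In < Incoming.size(); ++In)
    Entry = Masks[In] ? Incoming[In] : Entry; // select(Mask[In], In[In], Entry)
  return Entry;
}

int main() {
  // Three incoming edges; only the mask of edge 2 is set, so its value wins.
  std::printf("%d\n", blendLane({10, 20, 30}, {false, false, true})); // 30
}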
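Finally, the foldSelectShuffle rewrite above turns V1/V2 into (mask element, assigned position) pairs so the inputs can be stable-sorted by their base mask values while the reconstruction masks are recomputed by looking positions up through the pairs' second members. A reduced, self-contained model of that reindexing; the values are made up.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // V1 pairs up (source mask element, position handed out when first seen).
  std::vector<std::pair<int, int>> V1 = {{5, 0}, {1, 1}, {3, 2}};

  // Sort the input by ascending mask value, giving a simpler input shuffle...
  std::stable_sort(V1.begin(), V1.end(),
                   [](const auto &A, const auto &B) { return A.first < B.first; });

  // ...then rebuild the reconstruction mask by finding where each originally
  // handed-out position moved to, as FindIndex does in the patch.
  auto FindIndex = [&](int M) {
    auto It = std::find_if(V1.begin(), V1.end(),
                           [M](const auto &A) { return A.second == M; });
    assert(It != V1.end() && "Expected all entries in Mask");
    return static_cast<int>(std::distance(V1.begin(), It));
  };
  for (int M : {0, 1, 2})
    std::printf("%d ", FindIndex(M)); // prints: 2 0 1
  std::printf("\n");
}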