| field | value | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
| commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
| tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize | |
| parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
| parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
16 files changed, 5822 insertions, 3382 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 260d7889906b..fa2459d1ca02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -103,13 +103,11 @@ #include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <cassert> #include <cstdint> #include <cstdlib> #include <iterator> -#include <limits> #include <numeric> #include <optional> #include <tuple> @@ -900,9 +898,9 @@ bool Vectorizer::vectorizeChain(Chain &C) { // Chain is in offset order, so C[0] is the instr with the lowest offset, // i.e. the root of the vector. - Value *Bitcast = Builder.CreateBitCast( - getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS)); - VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); + VecInst = Builder.CreateAlignedLoad(VecTy, + getLoadStorePointerOperand(C[0].Inst), + Alignment); unsigned VecIdx = 0; for (const ChainElem &E : C) { @@ -976,8 +974,7 @@ bool Vectorizer::vectorizeChain(Chain &C) { // i.e. the root of the vector. VecInst = Builder.CreateAlignedStore( Vec, - Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst), - VecTy->getPointerTo(AS)), + getLoadStorePointerOperand(C[0].Inst), Alignment); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f923f0be6621..37a356c43e29 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -289,7 +289,7 @@ void LoopVectorizeHints::getHintsFromMetadata() { } void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) + if (!Name.starts_with(Prefix())) return; Name = Name.substr(Prefix().size(), StringRef::npos); @@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // If we found a vectorized variant of a function, note that so LV can + // make better decisions about maximum VF. + if (CI && !VFDatabase::getMappings(*CI).empty()) + VecCallVariantsFound = true; + // Check that the instruction return type is vectorizable. // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(I.getType()) && @@ -1242,13 +1247,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { bool LoopVectorizationLegality::blockCanBePredicated( BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, - SmallPtrSetImpl<const Instruction *> &MaskedOp, - SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { + SmallPtrSetImpl<const Instruction *> &MaskedOp) const { for (Instruction &I : *BB) { // We can predicate blocks with calls to assume, as long as we drop them in // case we flatten the CFG via predication. if (match(&I, m_Intrinsic<Intrinsic::assume>())) { - ConditionalAssumes.insert(&I); + MaskedOp.insert(&I); continue; } @@ -1345,16 +1349,13 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // We must be able to predicate all blocks that need to be predicated. 
- if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers, MaskedOp, - ConditionalAssumes)) { - reportVectorizationFailure( - "Control flow cannot be substituted for a select", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); - return false; - } + if (blockNeedsPredication(BB) && + !blockCanBePredicated(BB, SafePointers, MaskedOp)) { + reportVectorizationFailure( + "Control flow cannot be substituted for a select", + "control flow cannot be substituted for a select", "NoCFGForSelect", + ORE, TheLoop, BB->getTerminator()); + return false; } } @@ -1554,14 +1555,14 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<Value *, 8> SafePointers; + // Collect masked ops in temporary set first to avoid partially populating + // MaskedOp if a block cannot be predicated. SmallPtrSet<const Instruction *, 8> TmpMaskedOp; - SmallPtrSet<Instruction *, 8> TmpConditionalAssumes; // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1570,9 +1571,6 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 13357cb06c55..577ce8000de2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -31,6 +31,7 @@ namespace llvm { class LoopInfo; +class DominatorTree; class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; @@ -45,13 +46,17 @@ class VPBuilder { VPBasicBlock *BB = nullptr; VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); + /// Insert \p VPI in BB at InsertPt if BB is set. + VPInstruction *tryInsertInstruction(VPInstruction *VPI) { + if (BB) + BB->insert(VPI, InsertPt); + return VPI; + } + VPInstruction *createInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, const Twine &Name = "") { - VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name); - if (BB) - BB->insert(Instr, InsertPt); - return Instr; + return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name)); } VPInstruction *createInstruction(unsigned Opcode, @@ -62,6 +67,7 @@ class VPBuilder { public: VPBuilder() = default; + VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } /// Clear the insertion point: created instructions will not be inserted into /// a block. @@ -116,10 +122,11 @@ public: InsertPt = IP; } - /// Insert and return the specified instruction. - VPInstruction *insert(VPInstruction *I) const { - BB->insert(I, InsertPt); - return I; + /// This specifies that created instructions should be inserted at the + /// specified point. 
+ void setInsertPoint(VPRecipeBase *IP) { + BB = IP->getParent(); + InsertPt = IP->getIterator(); } /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as @@ -138,6 +145,13 @@ public: return createInstruction(Opcode, Operands, DL, Name); } + VPInstruction *createOverflowingOp(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, + DebugLoc DL, const Twine &Name = "") { + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + } VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } @@ -158,6 +172,12 @@ public: Name); } + /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A + /// and \p B. + /// TODO: add createFCmp when needed. + VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = {}, const Twine &Name = ""); + //===--------------------------------------------------------------------===// // RAII helpers. //===--------------------------------------------------------------------===// @@ -268,6 +288,9 @@ class LoopVectorizationPlanner { /// Loop Info analysis. LoopInfo *LI; + /// The dominator tree. + DominatorTree *DT; + /// Target Library Info. const TargetLibraryInfo *TLI; @@ -298,16 +321,14 @@ class LoopVectorizationPlanner { VPBuilder Builder; public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo &TTI, - LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, - InterleavedAccessInfo &IAI, - PredicatedScalarEvolution &PSE, - const LoopVectorizeHints &Hints, - OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + LoopVectorizationPlanner( + Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, + PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, + OptimizationRemarkEmitter *ORE) + : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. @@ -333,7 +354,7 @@ public: executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, - DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); + const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); @@ -377,8 +398,7 @@ private: /// returned VPlan is valid for. If no VPlan can be built for the input range, /// set the largest included VF to the maximum VF for which no plan could be /// built. 
- std::optional<VPlanPtr> tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions); + VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b603bbe55dc9..f82e161fb846 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -27,7 +27,7 @@ // // There is a development effort going on to migrate loop vectorizer to the // VPlan infrastructure and to introduce outer loop vectorization support (see -// docs/Proposal/VectorizationPlan.rst and +// docs/VectorizationPlan.rst and // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this // purpose, we temporarily introduced the VPlan-native vectorization path: an // alternative vectorization path that is natively implemented on top of the @@ -57,6 +57,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlan.h" +#include "VPlanAnalysis.h" #include "VPlanHCFGBuilder.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" @@ -111,10 +112,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -390,6 +393,21 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor( cl::desc( "Override cost based safe divisor widening for div/rem instructions")); +static cl::opt<bool> UseWiderVFIfCallVariantsPresent( + "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), + cl::Hidden, + cl::desc("Try wider VFs if they enable the use of vector variants")); + +// Likelyhood of bypassing the vectorized loop because assumptions about SCEV +// variables not overflowing do not hold. See `emitSCEVChecks`. +static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because pointers overlap. See +// `emitMemRuntimeChecks`. +static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because there are zero trips left +// after prolog. See `emitIterationCountCheck`. +static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -408,13 +426,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } -/// A helper function that returns an integer or floating-point constant with -/// value C. -static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { - return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) - : ConstantFP::get(Ty, C); -} - /// Returns "best known" trip count for the specified loop \p L as defined by /// the following procedure: /// 1) Returns exact trip count if it is known. 
@@ -556,10 +567,6 @@ public: const VPIteration &Instance, VPTransformState &State); - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, - VPTransformState &State); - /// Try to vectorize interleaved access group \p Group with the base address /// given in \p Addr, optionally masking the vector operations if \p /// BlockInMask is non-null. Use \p State to translate given VPValues to IR @@ -634,10 +641,6 @@ protected: /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally - /// represented as. - void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -943,21 +946,21 @@ protected: /// Look for a meaningful debug location on the instruction or it's /// operands. -static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { +static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { if (!I) - return I; + return DebugLoc(); DebugLoc Empty; if (I->getDebugLoc() != Empty) - return I; + return I->getDebugLoc(); for (Use &Op : I->operands()) { if (Instruction *OpInst = dyn_cast<Instruction>(Op)) if (OpInst->getDebugLoc() != Empty) - return OpInst; + return OpInst->getDebugLoc(); } - return I; + return I->getDebugLoc(); } /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I @@ -1021,14 +1024,6 @@ const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } -static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, - ElementCount VF) { - assert(FTy->isFloatingPointTy() && "Expected floating point type!"); - Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); - Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); - return B.CreateUIToFP(RuntimeVF, FTy); -} - void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, @@ -1050,6 +1045,23 @@ void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, << Msg); } +/// Report successful vectorization of the loop. In case an outer loop is +/// vectorized, prepend "outer" to the vectorization remark. +static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, + VectorizationFactor VF, unsigned IC) { + LLVM_DEBUG(debugVectorizationMessage( + "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", + nullptr)); + StringRef LoopType = TheLoop->isInnermost() ? 
"" : "outer "; + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "vectorized " << LoopType << "loop (vectorization width: " + << ore::NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; + }); +} + } // end namespace llvm #ifndef NDEBUG @@ -1104,7 +1116,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { RecWithFlags->dropPoisonGeneratingFlags(); } else { - Instruction *Instr = CurRec->getUnderlyingInstr(); + Instruction *Instr = dyn_cast_or_null<Instruction>( + CurRec->getVPSingleValue()->getUnderlyingValue()); (void)Instr; assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && "found instruction with poison generating flags not covered by " @@ -1247,6 +1260,13 @@ public: /// avoid redundant calculations. void setCostBasedWideningDecision(ElementCount VF); + /// A call may be vectorized in different ways depending on whether we have + /// vectorized variants available and whether the target supports masking. + /// This function analyzes all calls in the function at the supplied VF, + /// makes a decision based on the costs of available options, and stores that + /// decision in a map for use in planning and plan execution. + void setVectorizedCallDecision(ElementCount VF); + /// A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -1270,7 +1290,7 @@ public: void collectElementTypesForWidening(); /// Split reductions into those that happen in the loop, and those that happen - /// outside. In loop reductions are collected into InLoopReductionChains. + /// outside. In loop reductions are collected into InLoopReductions. void collectInLoopReductions(); /// Returns true if we should use strict in-order reductions for the given @@ -1358,7 +1378,9 @@ public: CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, - CM_Scalarize + CM_Scalarize, + CM_VectorCall, + CM_IntrinsicCall }; /// Save vectorization decision \p W and \p Cost taken by the cost model for @@ -1414,6 +1436,29 @@ public: return WideningDecisions[InstOnVF].second; } + struct CallWideningDecision { + InstWidening Kind; + Function *Variant; + Intrinsic::ID IID; + std::optional<unsigned> MaskPos; + InstructionCost Cost; + }; + + void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, + Function *Variant, Intrinsic::ID IID, + std::optional<unsigned> MaskPos, + InstructionCost Cost) { + assert(!VF.isScalar() && "Expected vector VF"); + CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, + MaskPos, Cost}; + } + + CallWideningDecision getCallWideningDecision(CallInst *CI, + ElementCount VF) const { + assert(!VF.isScalar() && "Expected vector VF"); + return CallWideningDecisions.at(std::make_pair(CI, VF)); + } + /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. @@ -1447,11 +1492,15 @@ public: /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. + /// Also make a decision on what to do about call instructions in the loop + /// at that VF -- scalarize, call a known vector routine, or call a + /// vector intrinsic. 
void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); + setVectorizedCallDecision(VF); collectLoopUniforms(VF); collectLoopScalars(VF); } @@ -1606,20 +1655,9 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi - /// nodes to the chain of instructions representing the reductions. Uses a - /// MapVector to ensure deterministic iteration order. - using ReductionChainMap = - SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; - - /// Return the chain of instructions representing an inloop reduction. - const ReductionChainMap &getInLoopReductionChains() const { - return InLoopReductionChains; - } - /// Returns true if the Phi is part of an inloop reduction. bool isInLoopReduction(PHINode *Phi) const { - return InLoopReductionChains.count(Phi); + return InLoopReductions.contains(Phi); } /// Estimate cost of an intrinsic call instruction CI if it were vectorized @@ -1629,16 +1667,13 @@ public: /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead - /// if it's needed. The flag NeedToScalarize shows if the call needs to be - /// scalarized - - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - Function **Variant, - bool *NeedsMask = nullptr) const; + /// if it's needed. + InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { WideningDecisions.clear(); + CallWideningDecisions.clear(); Uniforms.clear(); Scalars.clear(); } @@ -1675,14 +1710,14 @@ private: /// elements is a power-of-2 larger than zero. If scalable vectorization is /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). - FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. - ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, + ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, @@ -1705,7 +1740,7 @@ private: /// part of that pattern. std::optional<InstructionCost> getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, - TTI::TargetCostKind CostKind); + TTI::TargetCostKind CostKind) const; /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -1783,15 +1818,12 @@ private: /// scalarized. DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; - /// PHINodes of the reductions that should be expanded in-loop along with - /// their associated chains of reduction operations, in program order from top - /// (PHI) to bottom - ReductionChainMap InLoopReductionChains; + /// PHINodes of the reductions that should be expanded in-loop. + SmallPtrSet<PHINode *, 4> InLoopReductions; /// A Map of inloop reduction operations and their immediate chain operand. 
/// FIXME: This can be removed once reductions can be costed correctly in - /// vplan. This was added to allow quick lookup to the inloop operations, - /// without having to loop through InLoopReductionChains. + /// VPlan. This was added to allow quick lookup of the inloop operations. DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; /// Returns the expected difference in cost from scalarizing the expression @@ -1830,6 +1862,11 @@ private: DecisionList WideningDecisions; + using CallDecisionList = + DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; + + CallDecisionList CallWideningDecisions; + /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. bool needsExtract(Value *V, ElementCount VF) const { @@ -1933,12 +1970,14 @@ class GeneratedRTChecks { SCEVExpander MemCheckExp; bool CostTooHigh = false; + const bool AddBranchWeights; public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL) + TargetTransformInfo *TTI, const DataLayout &DL, + bool AddBranchWeights) : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check") {} + MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1990,9 +2029,9 @@ public: }, IC); } else { - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + MemRuntimeCheckCond = addRuntimeChecks( + MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), + MemCheckExp, VectorizerParams::HoistRuntimeChecks); } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " @@ -2131,8 +2170,10 @@ public: DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); + BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); + if (AddBranchWeights) + setBranchWeights(BI, SCEVCheckBypassWeights); + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2156,9 +2197,12 @@ public: if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(MemCheckBlock, *LI); - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); + if (AddBranchWeights) { + setBranchWeights(BI, MemCheckBypassWeights); + } + ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( Pred->getTerminator()->getDebugLoc()); @@ -2252,157 +2296,17 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// -/// This function adds -/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) -/// to each vector element of Val. The sequence starts at StartIndex. -/// \p Opcode is relevant for FP induction variable. 
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilderBase &Builder) { - assert(VF.isVector() && "only vector VFs are supported"); - - // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - SmallVector<Constant *, 8> Indices; - - // Create a vector of consecutive numbers from zero to VF. - VectorType *InitVecValVTy = ValVTy; - if (STy->isFloatingPointTy()) { - Type *InitVecValSTy = - IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); - InitVecValVTy = VectorType::get(InitVecValSTy, VLen); - } - Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw - // flags, which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. - assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - -/// Compute scalar induction steps. \p ScalarIV is the scalar induction -/// variable on which to base the steps, \p Step is the size of the step. -static void buildScalarSteps(Value *ScalarIV, Value *Step, - const InductionDescriptor &ID, VPValue *Def, - VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - // Ensure step has the same type as that of scalar IV. - Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - if (ScalarIVTy != Step->getType()) { - // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to - // avoid separate truncate here. - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - Step = State.Builder.CreateTrunc(Step, ScalarIVTy); - } - - // We build scalar steps for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (ScalarIVTy->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = ID.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Determine the number of scalars we need to generate for each unroll - // iteration. - bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); - // Compute the scalar steps and save the results in State. 
- Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), - ScalarIVTy->getScalarSizeInBits()); - Type *VecIVTy = nullptr; - Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!FirstLaneOnly && State.VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, State.VF); - UnitStepVec = - Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); - SplatStep = Builder.CreateVectorSplat(State.VF, Step); - SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); - } - - unsigned StartPart = 0; - unsigned EndPart = State.UF; - unsigned StartLane = 0; - unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); - if (State.Instance) { - StartPart = State.Instance->Part; - EndPart = StartPart + 1; - StartLane = State.Instance->Lane.getKnownLane(); - EndLane = StartLane + 1; - } - for (unsigned Part = StartPart; Part < EndPart; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - - if (!FirstLaneOnly && State.VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); - auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); - if (ScalarIVTy->isFloatingPointTy()) - InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); - auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); - auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); - State.set(Def, Add, Part); - // It's useful to record the lane values too for the known minimum number - // of elements so we do those below. This improves the code quality when - // trying to extract the first element, for example. - } - - if (ScalarIVTy->isFloatingPointTy()) - StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); - - for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { - Value *StartIdx = Builder.CreateBinOp( - AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); - auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); - State.set(Def, Add, VPIteration(Part, Lane)); - } - } -} - /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. /// For pointer induction, returns StartValue[Index * StepValue]. /// FIXME: The newly created binary instructions should contain nsw/nuw /// flags, which can be found from the original scalar operations. -static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, - Value *StartValue, Value *Step, - const InductionDescriptor &ID) { +static Value * +emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, + Value *Step, + InductionDescriptor::InductionKind InductionKind, + const BinaryOperator *InductionBinOp) { Type *StepTy = Step->getType(); Value *CastedIndex = StepTy->isIntegerTy() ? 
B.CreateSExtOrTrunc(Index, StepTy) @@ -2446,7 +2350,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, return B.CreateMul(X, Y); }; - switch (ID.getKind()) { + switch (InductionKind) { case InductionDescriptor::IK_IntInduction: { assert(!isa<VectorType>(Index->getType()) && "Vector indices not supported for integer inductions yet"); @@ -2464,7 +2368,6 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, assert(!isa<VectorType>(Index->getType()) && "Vector indices not supported for FP inductions yet"); assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); - auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && @@ -2524,17 +2427,6 @@ static bool isIndvarOverflowCheckKnownFalse( return false; } -void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, - const VPIteration &Instance, - VPTransformState &State) { - Value *ScalarInst = State.get(Def, Instance); - Value *VectorValue = State.get(Def, Instance.Part); - VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, - Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); - State.set(Def, VectorValue, Instance.Part); -} - // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). @@ -2612,7 +2504,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - State.setDebugLocFromInst(AddrPart); + if (auto *I = dyn_cast<Instruction>(AddrPart)) + State.setDebugLocFrom(I->getDebugLoc()); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2630,14 +2523,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); - - // Cast to the vector pointer type. - unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); - Type *PtrTy = VecTy->getPointerTo(AddressSpace); - AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); + AddrParts.push_back(AddrPart); } - State.setDebugLocFromInst(Instr); + State.setDebugLocFrom(Instr->getDebugLoc()); Value *PoisonVec = PoisonValue::get(VecTy); auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( @@ -2835,13 +2724,20 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, bool IsVoidRetTy = Instr->getType()->isVoidTy(); Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) + if (!IsVoidRetTy) { Cloned->setName(Instr->getName() + ".cloned"); +#if !defined(NDEBUG) + // Verify that VPlan type inference results agree with the type of the + // generated values. + assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && + "inferred type and type from generated instructions do not match"); +#endif + } RepRecipe->setFlags(Cloned); - if (Instr->getDebugLoc()) - State.setDebugLocFromInst(Instr); + if (auto DL = Instr->getDebugLoc()) + State.setDebugLocFrom(DL); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. @@ -3019,9 +2915,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // dominator of the exit blocks. 
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3151,15 +3049,17 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - EndValue = - emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); + EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), + Step, II.getKind(), II.getInductionBinOp()); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). if (AdditionalBypass.first) { - B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - EndValueFromAdditionalBypass = emitTransformedIndex( - B, AdditionalBypass.second, II.getStartValue(), Step, II); + B.SetInsertPoint(AdditionalBypass.first, + AdditionalBypass.first->getFirstInsertionPt()); + EndValueFromAdditionalBypass = + emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), + Step, II.getKind(), II.getInductionBinOp()); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3240,16 +3140,25 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // 3) Otherwise, construct a runtime check. if (!Cost->requiresScalarEpilogue(VF.isVector()) && !Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); - // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + // TODO: At the moment, CreateICmpEQ will simplify conditions with constant + // operands. Perform simplification directly on VPlan once the branch is + // modeled there. + IRBuilder<> B(LoopMiddleBlock->getTerminator()); + B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); + Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); + BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); + BI.setCondition(CmpN); + if (hasBranchWeightMD(*ScalarLatchTerm)) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = UF * VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(BI, Weights); + } } #ifdef EXPENSIVE_CHECKS @@ -3373,7 +3282,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *Step = StepVPV->isLiveIn() ? 
StepVPV->getLiveInIRValue() : State.get(StepVPV, {0, 0}); Value *Escape = - emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); + emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, + II.getKind(), II.getInductionBinOp()); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3445,76 +3355,33 @@ static void cse(BasicBlock *BB) { } } -InstructionCost LoopVectorizationCostModel::getVectorCallCost( - CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { - Function *F = CI->getCalledFunction(); - Type *ScalarRetTy = CI->getType(); - SmallVector<Type *, 4> Tys, ScalarTys; - bool MaskRequired = Legal->isMaskRequired(CI); - for (auto &ArgOp : CI->args()) - ScalarTys.push_back(ArgOp->getType()); +InstructionCost +LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, + ElementCount VF) const { + // We only need to calculate a cost if the VF is scalar; for actual vectors + // we should already have a pre-calculated cost at each VF. + if (!VF.isScalar()) + return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; - // Estimate cost of scalarized vector call. The source operands are assumed - // to be vectors, so we need to extract individual elements from there, - // execute VF scalar calls, and then gather the result into the vector return - // value. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost ScalarCallCost = - TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); - if (VF.isScalar()) - return ScalarCallCost; - - // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); - for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); - - // Compute costs of unpacking argument values for the scalar calls and - // packing the return values to a vector. - InstructionCost ScalarizationCost = - getScalarizationOverhead(CI, VF, CostKind); + Type *RetTy = CI->getType(); + if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) + return *RedCost; - InstructionCost Cost = - ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; - - // If we can't emit a vector call for this function, then the currently found - // cost is the cost we need to return. - InstructionCost MaskCost = 0; - VFShape Shape = VFShape::get(*CI, VF, MaskRequired); - if (NeedsMask) - *NeedsMask = MaskRequired; - Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - // If we want an unmasked vector function but can't find one matching the VF, - // maybe we can find vector function that does use a mask and synthesize - // an all-true mask. - if (!VecFunc && !MaskRequired) { - Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); - VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); - // If we found one, add in the cost of creating a mask - if (VecFunc) { - if (NeedsMask) - *NeedsMask = true; - MaskCost = TTI.getShuffleCost( - TargetTransformInfo::SK_Broadcast, - VectorType::get( - IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), - VF)); - } - } + SmallVector<Type *, 4> Tys; + for (auto &ArgOp : CI->args()) + Tys.push_back(ArgOp->getType()); - // We don't support masked function calls yet, but we can scalarize a - // masked call with branches (unless VF is scalable). - if (!TLI || CI->isNoBuiltin() || !VecFunc) - return VF.isScalable() ? 
InstructionCost::getInvalid() : Cost; + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); - // If the corresponding vector cost is cheaper, return its cost. - InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; - if (VectorCallCost < Cost) { - *Variant = VecFunc; - Cost = VectorCallCost; + // If this is an intrinsic we may have a lower cost for it. + if (getVectorIntrinsicIDForCall(CI, TLI)) { + InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); + return std::min(ScalarCallCost, IntrinsicCost); } - return Cost; + return ScalarCallCost; } static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { @@ -3558,146 +3425,8 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) { return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; } -void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { - // For every instruction `I` in MinBWs, truncate the operands, create a - // truncated version of `I` and reextend its result. InstCombine runs - // later and will remove any ext/trunc pairs. - SmallPtrSet<Value *, 4> Erased; - for (const auto &KV : Cost->getMinimalBitwidths()) { - // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from State indicates that it - // wasn't vectorized. - // FIXME: Should not rely on getVPValue at this point. - VPValue *Def = State.Plan->getVPValue(KV.first, true); - if (!State.hasAnyVectorValue(Def)) - continue; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = State.get(Def, Part); - if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) - continue; - Type *OriginalTy = I->getType(); - Type *ScalarTruncatedTy = - IntegerType::get(OriginalTy->getContext(), KV.second); - auto *TruncatedTy = VectorType::get( - ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); - if (TruncatedTy == OriginalTy) - continue; - - IRBuilder<> B(cast<Instruction>(I)); - auto ShrinkOperand = [&](Value *V) -> Value * { - if (auto *ZI = dyn_cast<ZExtInst>(V)) - if (ZI->getSrcTy() == TruncatedTy) - return ZI->getOperand(0); - return B.CreateZExtOrTrunc(V, TruncatedTy); - }; - - // The actual instruction modification depends on the instruction type, - // unfortunately. - Value *NewI = nullptr; - if (auto *BO = dyn_cast<BinaryOperator>(I)) { - NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), - ShrinkOperand(BO->getOperand(1))); - - // Any wrapping introduced by shrinking this operation shouldn't be - // considered undefined behavior. So, we can't unconditionally copy - // arithmetic wrapping flags to NewI. 
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); - } else if (auto *CI = dyn_cast<ICmpInst>(I)) { - NewI = - B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), - ShrinkOperand(CI->getOperand(1))); - } else if (auto *SI = dyn_cast<SelectInst>(I)) { - NewI = B.CreateSelect(SI->getCondition(), - ShrinkOperand(SI->getTrueValue()), - ShrinkOperand(SI->getFalseValue())); - } else if (auto *CI = dyn_cast<CastInst>(I)) { - switch (CI->getOpcode()) { - default: - llvm_unreachable("Unhandled cast!"); - case Instruction::Trunc: - NewI = ShrinkOperand(CI->getOperand(0)); - break; - case Instruction::SExt: - NewI = B.CreateSExtOrTrunc( - CI->getOperand(0), - smallestIntegerVectorType(OriginalTy, TruncatedTy)); - break; - case Instruction::ZExt: - NewI = B.CreateZExtOrTrunc( - CI->getOperand(0), - smallestIntegerVectorType(OriginalTy, TruncatedTy)); - break; - } - } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { - auto Elements0 = - cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); - auto Elements1 = - cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); - auto *O1 = B.CreateZExtOrTrunc( - SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); - - NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); - } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { - // Don't do anything with the operands, just extend the result. - continue; - } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { - auto Elements = - cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); - auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); - NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); - } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { - auto Elements = - cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); - auto *O0 = B.CreateZExtOrTrunc( - EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); - NewI = B.CreateExtractElement(O0, EE->getOperand(2)); - } else { - // If we don't know what to do, be conservative and don't do anything. - continue; - } - - // Lastly, extend the result. - NewI->takeName(cast<Instruction>(I)); - Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); - I->replaceAllUsesWith(Res); - cast<Instruction>(I)->eraseFromParent(); - Erased.insert(I); - State.reset(Def, Res, Part); - } - } - - // We'll have created a bunch of ZExts that are now parentless. Clean up. - for (const auto &KV : Cost->getMinimalBitwidths()) { - // If the value wasn't vectorized, we must maintain the original scalar - // type. The absence of the value from State indicates that it - // wasn't vectorized. - // FIXME: Should not rely on getVPValue at this point. - VPValue *Def = State.Plan->getVPValue(KV.first, true); - if (!State.hasAnyVectorValue(Def)) - continue; - for (unsigned Part = 0; Part < UF; ++Part) { - Value *I = State.get(Def, Part); - ZExtInst *Inst = dyn_cast<ZExtInst>(I); - if (Inst && Inst->use_empty()) { - Value *NewI = Inst->getOperand(0); - Inst->eraseFromParent(); - State.reset(Def, NewI, Part); - } - } - } -} - void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VPlan &Plan) { - // Insert truncates and extends for any truncated instructions as hints to - // InstCombine. 
- if (VF.isVector()) - truncateToMinimalBitwidths(State); - // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) fixNonInductionPHIs(Plan, State); @@ -3710,6 +3439,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + PSE.getSE()->forgetBlockAndLoopDispositions(); // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE @@ -3718,7 +3448,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, OrigLoop->getExitBlocks(ExitBlocks); for (BasicBlock *Exit : ExitBlocks) for (PHINode &PN : Exit->phis()) - PSE.getSE()->forgetValue(&PN); + PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); @@ -3744,7 +3474,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated // in the exit block, so update the builder. - State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); + State.Builder.SetInsertPoint(State.CFG.ExitBB, + State.CFG.ExitBB->getFirstNonPHIIt()); for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); @@ -3782,40 +3513,10 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores - // sank outside of the loop would keep the same order as they had in the - // original loop. - SmallVector<VPReductionPHIRecipe *> ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) - ReductionPHIList.emplace_back(ReductionPhi); + fixReduction(ReductionPhi, State); } - stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, - const VPReductionPHIRecipe *R2) { - auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; - auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; - - // If neither of the recipes has an intermediate store, keep the order the - // same. - if (!IS1 && !IS2) - return false; - - // If only one of the recipes has an intermediate store, then move it - // towards the beginning of the list. - if (IS1 && !IS2) - return true; - - if (!IS1 && IS2) - return false; - - // If both recipes have an intermediate store, then the recipe with the - // later store should be processed earlier. So it should go to the beginning - // of the list. - return DT->dominates(IS2, IS1); - }); - - for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) - fixReduction(ReductionPhi, State); for (VPRecipeBase &R : Header->phis()) { if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) @@ -3929,7 +3630,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( } // Fix the initial value of the original recurrence in the scalar loop. 
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); @@ -3953,90 +3654,56 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - State.setDebugLocFromInst(ReductionStartValue); + if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue)) + State.setDebugLocFrom(I->getDebugLoc()); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); - // This is the vector-clone of the value that leaves the loop. - Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(LoopMiddleBlock, + LoopMiddleBlock->getFirstInsertionPt()); - State.setDebugLocFromInst(LoopExitInst); + State.setDebugLocFrom(LoopExitInst->getDebugLoc()); Type *PhiTy = OrigPhi->getType(); - - VPBasicBlock *LatchVPBB = - PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); - BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { - for (unsigned Part = 0; Part < UF; ++Part) { - Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); - SelectInst *Sel = nullptr; - for (User *U : VecLoopExitInst->users()) { - if (isa<SelectInst>(U)) { - assert(!Sel && "Reduction exit feeding two selects"); - Sel = cast<SelectInst>(U); - } else - assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); - } - assert(Sel && "Reduction exit feeds no select"); - State.reset(LoopExitInstDef, Sel, Part); - - if (isa<FPMathOperator>(Sel)) - Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); - - // If the target can create a predicated operator for the reduction at no - // extra cost in the loop (for example a predicated vadd), it can be - // cheaper for the select to remain in the loop than be sunk out of it, - // and so use the select value for the phi instead of the old - // LoopExitValue. 
- if (PreferPredicatedReductionSelect || - TTI->preferPredicatedReductionSelect( - RdxDesc.getOpcode(), PhiTy, - TargetTransformInfo::ReductionFlags())) { - auto *VecRdxPhi = - cast<PHINode>(State.get(PhiR, Part)); - VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); + VPValue *Def = nullptr; + for (VPUser *U : LoopExitInstDef->users()) { + auto *S = dyn_cast<VPInstruction>(U); + if (S && S->getOpcode() == Instruction::Select) { + Def = S; + break; } } + if (Def) + LoopExitInstDef = Def; } + VectorParts RdxParts(UF); + for (unsigned Part = 0; Part < UF; ++Part) + RdxParts[Part] = State.get(LoopExitInstDef, Part); + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { - assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); + Builder.SetInsertPoint(LoopMiddleBlock, + LoopMiddleBlock->getFirstInsertionPt()); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = State.get(LoopExitInstDef, Part); - Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) - if (U != Trunc) { - U->replaceUsesOfWith(RdxParts[Part], Extnd); - RdxParts[Part] = Extnd; - } - } - Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - State.reset(LoopExitInstDef, RdxParts[Part], Part); } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); + Value *ReducedPartRdx = RdxParts[0]; unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the @@ -4046,21 +3713,21 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. - State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); + State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); if (PhiR->isOrdered()) - ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); + ReducedPartRdx = RdxParts[UF - 1]; else { // Floating-point operations should have some FMF to enable the reduction. 
IRBuilderBase::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = State.get(LoopExitInstDef, Part); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + Value *RdxPart = RdxParts[Part]; + if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) - ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } @@ -4070,7 +3737,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // target reduction in the loop using a Reduction recipe. if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = - createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); + createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -4107,7 +3774,8 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // inside the loop, create the final store here. if (StoreInst *SI = RdxDesc.IntermediateStore) { StoreInst *NewSI = - Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), + SI->getAlign()); propagateMetadata(NewSI, SI); // If the reduction value is used in other places, @@ -4436,7 +4104,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication( default: return true; case Instruction::Call: - return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF); + if (VF.isScalar()) + return true; + return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) + .Kind == CM_Scalarize; case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); @@ -4988,7 +4659,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { } FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { + unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5076,12 +4747,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; if (auto MaxVF = - getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; @@ -5105,6 +4776,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + unsigned 
MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", @@ -5115,7 +4787,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5153,7 +4825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); + return computeFeasibleMaxVF(MaxTC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5170,7 +4842,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we choose. @@ -5246,7 +4918,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( @@ -5285,31 +4957,35 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( } // When a scalar epilogue is required, at least one iteration of the scalar - // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a // max VF that results in a dead vector loop. - if (ConstTripCount > 0 && requiresScalarEpilogue(true)) - ConstTripCount -= 1; - - if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && - (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { - // If loop trip count (TC) is known at compile time there is no point in - // choosing VF greater than TC (as done in the loop below). Select maximum - // power of two which doesn't exceed TC. - // If MaxVectorElementCount is scalable, we only fall back on a fixed VF - // when the TC is less than or equal to the known number of lanes. - auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); + if (MaxTripCount > 0 && requiresScalarEpilogue(true)) + MaxTripCount -= 1; + + if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && + (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { + // If upper bound loop trip count (TC) is known at compile time there is no + // point in choosing VF greater than TC (as done in the loop below). Select + // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is + // scalable, we only fall back on a fixed VF when the TC is less than or + // equal to the known number of lanes. 
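The clamping logic above picks the largest power of two that does not exceed the (maximum) trip count, after reserving one iteration when a scalar epilogue must run. A small illustration of that arithmetic in plain C++ (bitFloor stands in for llvm::bit_floor; the real code additionally requires the trip count to be no larger than the widest register's element count):

#include <cstdio>

// Largest power of two <= x (x > 0); stands in for llvm::bit_floor.
unsigned bitFloor(unsigned x) {
  unsigned p = 1;
  while (p * 2 <= x)
    p *= 2;
  return p;
}

unsigned clampVF(unsigned maxTripCount, bool requiresScalarEpilogue) {
  // Keep one iteration for the scalar epilogue so the vector loop is not dead.
  if (maxTripCount > 0 && requiresScalarEpilogue)
    maxTripCount -= 1;
  return bitFloor(maxTripCount);
}

int main() {
  // Trip count 17 with a required scalar epilogue: clamp to bit_floor(16) = 16.
  std::printf("%u\n", clampVF(17, true));  // 16
  // Trip count 6, no epilogue needed: clamp to bit_floor(6) = 4.
  std::printf("%u\n", clampVF(6, false));  // 4
}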
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " - << ClampedConstTripCount << "\n"); - return ElementCount::getFixed(ClampedConstTripCount); + << ClampedUpperTripCount << "\n"); + return ElementCount::get( + ClampedUpperTripCount, + FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); } TargetTransformInfo::RegisterKind RegKind = ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && - TTI.shouldMaximizeVectorBandwidth(RegKind))) { + if (MaximizeBandwidth || + (MaximizeBandwidth.getNumOccurrences() == 0 && + (TTI.shouldMaximizeVectorBandwidth(RegKind) || + (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); @@ -5981,7 +5657,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, HasReductions && any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return RecurrenceDescriptor::isSelectCmpRecurrenceKind( + return RecurrenceDescriptor::isAnyOfRecurrenceKind( RdxDesc.getRecurrenceKind()); }); if (HasSelectCmpReductions) { @@ -6149,6 +5825,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { if (ValuesToIgnore.count(I)) continue; + collectInLoopReductions(); + // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { // Count the number of registers used, per register class, given all open @@ -6668,10 +6346,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, std::optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( - Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + Instruction *I, ElementCount VF, Type *Ty, + TTI::TargetCostKind CostKind) const { using namespace llvm::PatternMatch; // Early exit for no inloop reductions - if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) return std::nullopt; auto *VectorTy = cast<VectorType>(Ty); @@ -6706,10 +6385,10 @@ LoopVectorizationCostModel::getReductionPatternCost( // Find the reduction this chain is a part of and calculate the basic cost of // the reduction on its own. - Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); Instruction *ReductionPhi = LastChain; while (!isa<PHINode>(ReductionPhi)) - ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; @@ -7127,6 +6806,168 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } } +void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { + assert(!VF.isScalar() && + "Trying to set a vectorization decision for a scalar VF"); + + for (BasicBlock *BB : TheLoop->blocks()) { + // For each instruction in the old loop. 
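setVectorizedCallDecision, which continues below, ends by comparing three costs per call site (scalarize, call a vector library variant, use a vector intrinsic) and recording the cheapest as the widening decision. A hedged standalone sketch of that final comparison, with unavailable options modelled as infinite cost:

#include <cstdio>
#include <limits>

enum class WideningKind { Scalarize, VectorCall, IntrinsicCall };

// Stand-in for InstructionCost::getInvalid(): an unavailable option never wins.
constexpr double Invalid = std::numeric_limits<double>::infinity();

WideningKind pickCallWidening(double scalarCost, double vectorCost,
                              double intrinsicCost) {
  double cost = scalarCost;
  WideningKind decision = WideningKind::Scalarize;
  if (vectorCost <= cost) {
    cost = vectorCost;
    decision = WideningKind::VectorCall;
  }
  if (intrinsicCost <= cost) {
    cost = intrinsicCost;
    decision = WideningKind::IntrinsicCall;
  }
  return decision;
}

int main() {
  // No vector variant available, but a cheap intrinsic exists.
  auto d = pickCallWidening(/*scalar=*/40, /*vector=*/Invalid, /*intrinsic=*/6);
  std::printf("%d\n", static_cast<int>(d)); // 2 == IntrinsicCall
}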
+ for (Instruction &I : *BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + + if (!CI) + continue; + + InstructionCost ScalarCost = InstructionCost::getInvalid(); + InstructionCost VectorCost = InstructionCost::getInvalid(); + InstructionCost IntrinsicCost = InstructionCost::getInvalid(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + Function *ScalarFunc = CI->getCalledFunction(); + Type *ScalarRetTy = CI->getType(); + SmallVector<Type *, 4> Tys, ScalarTys; + bool MaskRequired = Legal->isMaskRequired(CI); + for (auto &ArgOp : CI->args()) + ScalarTys.push_back(ArgOp->getType()); + + // Compute corresponding vector type for return value and arguments. + Type *RetTy = ToVectorTy(ScalarRetTy, VF); + for (Type *ScalarTy : ScalarTys) + Tys.push_back(ToVectorTy(ScalarTy, VF)); + + // An in-loop reduction using an fmuladd intrinsic is a special case; + // we don't want the normal cost for that intrinsic. + if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { + setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, + getVectorIntrinsicIDForCall(CI, TLI), + std::nullopt, *RedCost); + continue; + } + + // Estimate cost of scalarized vector call. The source operands are + // assumed to be vectors, so we need to extract individual elements from + // there, execute VF scalar calls, and then gather the result into the + // vector return value. + InstructionCost ScalarCallCost = + TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); + + // Compute costs of unpacking argument values for the scalar calls and + // packing the return values to a vector. + InstructionCost ScalarizationCost = + getScalarizationOverhead(CI, VF, CostKind); + + ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; + + // Find the cost of vectorizing the call, if we can find a suitable + // vector variant of the function. + bool UsesMask = false; + VFInfo FuncInfo; + Function *VecFunc = nullptr; + // Search through any available variants for one we can use at this VF. + for (VFInfo &Info : VFDatabase::getMappings(*CI)) { + // Must match requested VF. + if (Info.Shape.VF != VF) + continue; + + // Must take a mask argument if one is required + if (MaskRequired && !Info.isMasked()) + continue; + + // Check that all parameter kinds are supported + bool ParamsOk = true; + for (VFParameter Param : Info.Shape.Parameters) { + switch (Param.ParamKind) { + case VFParamKind::Vector: + break; + case VFParamKind::OMP_Uniform: { + Value *ScalarParam = CI->getArgOperand(Param.ParamPos); + // Make sure the scalar parameter in the loop is invariant. + if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), + TheLoop)) + ParamsOk = false; + break; + } + case VFParamKind::OMP_Linear: { + Value *ScalarParam = CI->getArgOperand(Param.ParamPos); + // Find the stride for the scalar parameter in this loop and see if + // it matches the stride for the variant. + // TODO: do we need to figure out the cost of an extract to get the + // first lane? Or do we hope that it will be folded away? 
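The parameter check above only accepts a variant whose OMP_Linear argument really advances by the declared step each iteration (an affine add-recurrence with a matching constant stride). A small sketch of that check using a made-up affine summary instead of ScalarEvolution:

#include <cstdio>
#include <optional>

// Hypothetical summary of what ScalarEvolution would report for an argument:
// value = Start + Step * iteration, if it is affine in the current loop.
struct AffineArg {
  long long Start;
  long long Step;
  bool AffineInThisLoop;
};

// The variant declared the parameter as linear with this step; accept only if
// the scalar argument really advances by exactly that amount per iteration.
bool linearParamMatches(std::optional<AffineArg> arg, long long declaredStep) {
  if (!arg || !arg->AffineInThisLoop)
    return false;
  return arg->Step == declaredStep;
}

int main() {
  AffineArg idx{/*Start=*/0, /*Step=*/4, /*AffineInThisLoop=*/true};
  std::printf("%d\n", linearParamMatches(idx, 4)); // 1: variant usable
  std::printf("%d\n", linearParamMatches(idx, 1)); // 0: stride mismatch
}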
+ ScalarEvolution *SE = PSE.getSE(); + const auto *SAR = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); + + if (!SAR || SAR->getLoop() != TheLoop) { + ParamsOk = false; + break; + } + + const SCEVConstant *Step = + dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); + + if (!Step || + Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) + ParamsOk = false; + + break; + } + case VFParamKind::GlobalPredicate: + UsesMask = true; + break; + default: + ParamsOk = false; + break; + } + } + + if (!ParamsOk) + continue; + + // Found a suitable candidate, stop here. + VecFunc = CI->getModule()->getFunction(Info.VectorName); + FuncInfo = Info; + break; + } + + // Add in the cost of synthesizing a mask if one wasn't required. + InstructionCost MaskCost = 0; + if (VecFunc && UsesMask && !MaskRequired) + MaskCost = TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get(IntegerType::getInt1Ty( + VecFunc->getFunctionType()->getContext()), + VF)); + + if (TLI && VecFunc && !CI->isNoBuiltin()) + VectorCost = + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; + + // Find the cost of an intrinsic; some targets may have instructions that + // perform the operation without needing an actual call. + Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); + if (IID != Intrinsic::not_intrinsic) + IntrinsicCost = getVectorIntrinsicCost(CI, VF); + + InstructionCost Cost = ScalarCost; + InstWidening Decision = CM_Scalarize; + + if (VectorCost <= Cost) { + Cost = VectorCost; + Decision = CM_VectorCall; + } + + if (IntrinsicCost <= Cost) { + Cost = IntrinsicCost; + Decision = CM_IntrinsicCall; + } + + setCallWideningDecision(CI, VF, Decision, VecFunc, IID, + FuncInfo.getParamIndexForOptionalMask(), Cost); + } + } +} + InstructionCost LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy) { @@ -7156,7 +6997,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing - // have already been dealt with by the the time we get here. As a result, + // have already been dealt with by the time we get here. As a result, // it means we don't have to multiply the instruction cost by VF. 
assert(I->getOpcode() == Instruction::GetElementPtr || I->getOpcode() == Instruction::PHI || @@ -7384,6 +7225,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: llvm_unreachable("Instr did not go through cost modelling?"); + case LoopVectorizationCostModel::CM_VectorCall: + case LoopVectorizationCostModel::CM_IntrinsicCall: + llvm_unreachable_internal("Instr has invalid widening decision"); } llvm_unreachable("Unhandled case!"); @@ -7441,19 +7285,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } - case Instruction::Call: { - if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) - if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) - return *RedCost; - Function *Variant; - CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); - if (getVectorIntrinsicIDForCall(CI, TLI)) { - InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); - return std::min(CallCost, IntrinsicCost); - } - return CallCost; - } + case Instruction::Call: + return getVectorCallCost(cast<CallInst>(I), VF); case Instruction::ExtractValue: return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); case Instruction::Alloca: @@ -7521,8 +7354,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() { SmallVector<Instruction *, 4> ReductionOperations = RdxDesc.getReductionOpChain(Phi, TheLoop); bool InLoop = !ReductionOperations.empty(); + if (InLoop) { - InLoopReductionChains[Phi] = ReductionOperations; + InLoopReductions.insert(Phi); // Add the elements to InLoopReductionImmediateChains for cost modelling. Instruction *LastChain = Phi; for (auto *I : ReductionOperations) { @@ -7535,21 +7369,38 @@ void LoopVectorizationCostModel::collectInLoopReductions() { } } +VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL, const Twine &Name) { + assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && + Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); + return tryInsertInstruction( + new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); +} + +// This function will select a scalable VF if the target supports scalable +// vectors and a fixed one otherwise. // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment // doesn't have a cost model that can choose which plan to execute if // more than one is generated. -static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, - LoopVectorizationCostModel &CM) { +static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, + LoopVectorizationCostModel &CM) { unsigned WidestType; std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); - return WidestVectorRegBits / WidestType; + + TargetTransformInfo::RegisterKind RegKind = + TTI.enableScalableVectorization() + ? 
TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + return ElementCount::get(N, RegSize.isScalable()); } VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -7559,10 +7410,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. if (UserVF.isZero()) { - VF = ElementCount::getFixed(determineVPlanVF( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(), - CM)); + VF = determineVPlanVF(TTI, CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7571,6 +7419,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { << "overriding computed VF.\n"); VF = ElementCount::getFixed(4); } + } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " + << "not supported by the target.\n"); + reportVectorizationFailure( + "Scalable vectorization requested but not supported by the target", + "the scalable user-specified vectorization width for outer-loop " + "vectorization cannot be used because the target does not support " + "scalable vectors.", + "ScalableVFUnfeasible", ORE, OrigLoop); + return VectorizationFactor::Disabled(); } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF.getKnownMinValue()) && @@ -7624,9 +7483,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. + CM.collectInLoopReductions(); if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); if (!hasPlanWithVF(UserVF)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF @@ -7650,6 +7509,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) VFCandidates.insert(VF); + CM.collectInLoopReductions(); for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. 
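determineVPlanVF above divides the known minimum register width by the widest scalar type in the loop and keeps the scalable flag of the chosen register kind. Illustrative arithmetic with assumed register widths and element sizes:

#include <cstdio>

struct RegInfo {
  unsigned KnownMinBits; // e.g. 128 for a fixed SIMD register, or the minimum
                         // width of a scalable register
  bool Scalable;
};

// VF = (known minimum register width) / (widest scalar type in the loop),
// carrying over whether the register kind is scalable.
void printVPlanVF(RegInfo reg, unsigned widestTypeBits) {
  unsigned n = reg.KnownMinBits / widestTypeBits;
  std::printf("VF = %s%u\n", reg.Scalable ? "vscale x " : "", n);
}

int main() {
  printVPlanVF({128, false}, 32); // "VF = 4"            (128-bit fixed, i32)
  printVPlanVF({128, true}, 64);  // "VF = vscale x 2"   (scalable, i64/double)
}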
CM.collectUniformsAndScalars(VF); @@ -7660,7 +7520,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectInstsToScalarize(VF); } - CM.collectInLoopReductions(); buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); @@ -7705,7 +7564,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { if (MD) { const auto *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = - S && S->getString().startswith("llvm.loop.unroll.disable"); + S && S->getString().starts_with("llvm.loop.unroll.disable"); } MDs.push_back(LoopID->getOperand(i)); } @@ -7729,7 +7588,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { SCEV2ValueTy LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, - DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { + const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && @@ -7745,7 +7604,8 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + OrigLoop->getHeader()->getContext()); // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. @@ -7798,9 +7658,9 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. 
- BestVPlan.prepareToExecute( - ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, IsEpilogueVectorization); + BestVPlan.prepareToExecute(ILV.getTripCount(), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); @@ -7964,9 +7824,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, EPI.TripCount = Count; } - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; } @@ -8064,8 +7926,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // Generate a resume induction for the vector epilogue and put it in the // vector epilogue preheader Type *IdxTy = Legal->getWidestInductionType(); - PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", - LoopVectorPreHeader->getFirstNonPHI()); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); + EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); @@ -8110,9 +7972,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + unsigned MainLoopStep = UF * VF.getKnownMinValue(); + unsigned EpilogueLoopStep = + EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); + const uint32_t Weights[] = {EstimatedSkipCount, + MainLoopStep - EstimatedSkipCount}; + setBranchWeights(BI, Weights); + } + ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); return Insert; @@ -8206,6 +8081,33 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, return EdgeMaskCache[Edge] = EdgeMask; } +void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { + BasicBlock *Header = OrigLoop->getHeader(); + + // When not folding the tail, use nullptr to model all-true mask. + if (!CM.foldTailByMasking()) { + BlockMaskCache[Header] = nullptr; + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. 
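The new branch-weight code above assumes the remaining iteration count is uniformly distributed in [0, MainLoopStep), so the probability of skipping the epilogue vector loop is min(MainLoopStep, EpilogueLoopStep) / MainLoopStep. A worked example with assumed factors:

#include <algorithm>
#include <cstdio>

int main() {
  // Assume main loop VF * UF = 16 and epilogue VF * UF = 4 (made-up values).
  unsigned MainLoopStep = 16;
  unsigned EpilogueLoopStep = 4;

  // Remaining iterations are assumed uniform in [0, MainLoopStep), so the
  // chance of taking the bypass (Count < EpilogueLoopStep, epilogue skipped)
  // is min(Main, Epilogue) / Main.
  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  unsigned Weights[2] = {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};

  // Here that is a 4:12 split, i.e. a 25% chance the epilogue loop is skipped.
  std::printf("weights = {%u, %u}\n", Weights[0], Weights[1]);
}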
+ + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, NewInsertionPoint); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BlockMask = nullptr; + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + BlockMaskCache[Header] = BlockMask; +} + VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); @@ -8214,45 +8116,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { if (BCEntryIt != BlockMaskCache.end()) return BCEntryIt->second; + assert(OrigLoop->getHeader() != BB && + "Loop header must have cached block mask"); + // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - - if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) - return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - - assert(CM.foldTailByMasking() && "must fold the tail"); - - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) - return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } - return BlockMaskCache[BB] = BlockMask; - } - // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); @@ -8458,22 +8327,15 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { - Function *Variant; - // Is it beneficial to perform intrinsic call compared to lib - // call? 
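createHeaderMask above compares the widened canonical IV against the backedge-taken count (IV <= BTC) instead of IV < TC, because the trip count itself can wrap to zero when the loop runs the maximum number of iterations representable in the IV width, while BTC = TC - 1 still fits. An 8-bit illustration of the difference:

#include <cstdint>
#include <cstdio>

int main() {
  // 8-bit model: a loop that executes 256 iterations has a trip count that
  // wraps to 0 in 8 bits, but its backedge-taken count (255) still fits.
  uint8_t BTC = 255;        // backedge-taken count = TC - 1
  uint8_t TC = BTC + 1;     // wraps to 0

  for (unsigned iv = 0; iv < 4; ++iv) {
    bool ltTC = (uint8_t)iv < TC;    // always false: TC wrapped to 0
    bool leBTC = (uint8_t)iv <= BTC; // correct lane-active test
    std::printf("iv=%u  iv<TC=%d  iv<=BTC=%d\n", iv, ltTC, leBTC);
  }
}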
- InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, &Variant); - InstructionCost IntrinsicCost = - CM.getVectorIntrinsicCost(CI, VF); - return IntrinsicCost <= CallCost; + return CM.getCallWideningDecision(CI, VF).Kind == + LoopVectorizationCostModel::CM_IntrinsicCall; }, Range); if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); Function *Variant = nullptr; - ElementCount VariantVF; - bool NeedsMask = false; + std::optional<unsigned> MaskPos; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8492,16 +8354,19 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // finds a valid variant. if (Variant) return false; - CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); - // If we found a valid vector variant at this VF, then store the VF - // in case we need to generate a mask. - if (Variant) - VariantVF = VF; - return Variant != nullptr; + LoopVectorizationCostModel::CallWideningDecision Decision = + CM.getCallWideningDecision(CI, VF); + if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { + Variant = Decision.Variant; + MaskPos = Decision.MaskPos; + return true; + } + + return false; }, Range); if (ShouldUseVectorCall) { - if (NeedsMask) { + if (MaskPos.has_value()) { // We have 2 cases that would require a mask: // 1) The block needs to be predicated, either due to a conditional // in the scalar loop or use of an active lane mask with @@ -8516,17 +8381,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); - VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); - unsigned MaskPos = 0; - - for (const VFInfo &Info : VFDatabase::getMappings(*CI)) - if (Info.Shape == Shape) { - assert(Info.isMasked() && "Vector function info shape mismatch"); - MaskPos = Info.getParamIndexForOptionalMask().value(); - break; - } - - Ops.insert(Ops.begin() + MaskPos, Mask); + Ops.insert(Ops.begin() + *MaskPos, Mask); } return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), @@ -8747,8 +8602,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, } if (auto *CI = dyn_cast<CastInst>(Instr)) { - return toVPRecipeResult( - new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); + return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], + CI->getType(), *CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); @@ -8758,27 +8613,26 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - // Add assume instructions we need to drop to DeadInstructions, to prevent - // them from being added to the VPlan. - // TODO: We only need to drop assumes in blocks that get flattend. If the - // control flow is preserved, we should keep them. 
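When a masked vector variant is chosen, the recipe construction above splices the mask into the widened call operands at the position recorded in the widening decision (MaskPos), falling back to an all-true mask if the block itself needs no predicate. A toy sketch of that operand splice with placeholder strings (the names are invented, not VPlan values):

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Hypothetical widened call operands and the variant's declared mask slot.
  std::vector<std::string> Ops = {"%a.vec", "%b.vec"};
  unsigned MaskPos = 1; // made-up: the variant takes its mask as operand 1

  // If the block itself needs no predicate, an all-true mask is used instead.
  std::string Mask = "%all.true.mask";
  Ops.insert(Ops.begin() + MaskPos, Mask);

  for (const auto &Op : Ops)
    std::printf("%s\n", Op.c_str()); // %a.vec, %all.true.mask, %b.vec
}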
- SmallPtrSet<Instruction *, 4> DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); - DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) - VPlans.push_back(std::move(*Plan)); + if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { + // Now optimize the initial VPlan. + if (!Plan->hasVF(ElementCount::getFixed(1))) + VPlanTransforms::truncateToMinimalBitwidths( + *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); + VPlanTransforms::optimize(*Plan, *PSE.getSE()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); + VPlans.push_back(std::move(Plan)); + } VF = SubRange.End; } } // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, + DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); @@ -8790,102 +8644,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. - bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW - : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL, "index.next"); + new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, + {HasNUW, false}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - if (useActiveLaneMaskForControlFlow(Style)) { - // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *VecPreheader = - cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); - - // We can't use StartV directly in the ActiveLaneMask VPInstruction, since - // we have to take unrolling into account. Each part needs to start at - // Part * VF - auto *CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {StartV}, DL, "index.part.next"); - VecPreheader->appendRecipe(CanonicalIVIncrementParts); - - // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getTripCount(); - - VPValue *TripCount, *IncrementValue; - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // When avoiding a runtime check, the active.lane.mask inside the loop - // uses a modified trip count and the induction variable increment is - // done after the active.lane.mask intrinsic is called. - auto *TCMinusVF = - new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); - VecPreheader->appendRecipe(TCMinusVF); - IncrementValue = CanonicalIVPHI; - TripCount = TCMinusVF; - } else { - // When the loop is guarded by a runtime overflow check for the loop - // induction variable increment by VF, we can increment the value before - // the get.active.lane mask and use the unmodified tripcount. 
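With the lane-mask handling moved out of addCanonicalIVRecipes, the canonical IV logic that remains is simply: start at zero, add VF * UF per iteration, and branch on reaching the vector trip count. A scalar model of that skeleton with assumed factors:

#include <cstdio>

int main() {
  unsigned VF = 4, UF = 2;            // assumed vectorization and unroll factors
  unsigned TripCount = 21;
  // The vector loop only runs whole VF * UF chunks; the remainder is left to
  // the scalar epilogue (the real code guards against a zero vector trip count).
  unsigned VectorTripCount = TripCount - (TripCount % (VF * UF));

  unsigned Index = 0;                 // canonical IV, starts at 0
  do {
    // ...vector body for lanes [Index, Index + VF * UF)...
    Index += VF * UF;                 // index.next = index + VF * UF
  } while (Index != VectorTripCount); // BranchOnCount: exit when equal

  std::printf("vector loop covered %u of %u iterations\n", Index, TripCount);
}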
- EB->appendRecipe(CanonicalIVIncrement); - IncrementValue = CanonicalIVIncrement; - TripCount = TC; - } - - auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, - "active.lane.mask.entry"); - VecPreheader->appendRecipe(EntryALM); - - // Now create the ActiveLaneMaskPhi recipe in the main loop using the - // preheader ActiveLaneMask instruction. - auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); - Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); - - // Create the active lane mask for the next iteration of the loop. - CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, DL); - EB->appendRecipe(CanonicalIVIncrementParts); - - auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TripCount}, DL, - "active.lane.mask.next"); - EB->appendRecipe(ALM); - LaneMaskPhi->addOperand(ALM); - - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // Do the increment of the canonical IV after the active.lane.mask, because - // that value is still based off %CanonicalIVPHI - EB->appendRecipe(CanonicalIVIncrement); - } - - // We have to invert the mask here because a true condition means jumping - // to the exit block. - auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); - EB->appendRecipe(NotMask); - - VPInstruction *BranchBack = - new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); - EB->appendRecipe(BranchBack); - } else { - EB->appendRecipe(CanonicalIVIncrement); + EB->appendRecipe(CanonicalIVIncrement); - // Add the BranchOnCount VPInstruction to the latch. - VPInstruction *BranchBack = new VPInstruction( - VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchBack); - } + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the // original exit block. -static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, - VPBasicBlock *MiddleVPBB, Loop *OrigLoop, +static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPlan &Plan) { BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8902,8 +8678,8 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, } } -std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { +VPlanPtr +LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8914,24 +8690,6 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // process after constructing the initial VPlan. 
// --------------------------------------------------------------------------- - for (const auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - RecurKind Kind = - Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - RecipeBuilder.recordRecipeOf(Phi); - for (const auto &R : ReductionOperations) { - RecipeBuilder.recordRecipeOf(R); - // For min/max reductions, where we have a pair of icmp/select, we also - // need to record the ICmp recipe, so it can be removed later. - assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && - "Only min/max recurrences allowed for inloop reductions"); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); - } - } - // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a @@ -8972,23 +8730,27 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); - VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); - VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); + Plan->getVectorLoopRegion()->setExiting(LatchVPBB); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split // it up into different VPlans. + // TODO: Consider using getDecisionAndClampRange here to split up VPlans. bool IVUpdateMayOverflow = false; for (ElementCount VF : Range) IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); - Instruction *DLInst = - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), - DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle(IVUpdateMayOverflow)); + DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); + // When not folding the tail, we know that the induction increment will not + // overflow. + bool HasNUW = Style == TailFoldingStyle::None; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + + // Proactively create header mask. Masks for other blocks are created on + // demand. + RecipeBuilder.createHeaderMask(*Plan); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9005,14 +8767,8 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. - for (Instruction &I : BB->instructionsWithoutDebug(false)) { + for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { Instruction *Instr = &I; - - // First filter out irrelevant instructions, to ensure no recipes are - // built for them. 
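The traversal mentioned above visits the loop's basic blocks in a topological (reverse post-order) order, so each block is processed only after all of its predecessors inside the loop body. A minimal standalone RPO walk over a hypothetical if-then-else body (the block names are made up):

#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

// Reverse post-order over an acyclic CFG (the loop body without its backedge):
// DFS post-order, then reverse, guarantees predecessors come first.
void rpoVisit(const std::string &bb,
              const std::map<std::string, std::vector<std::string>> &succs,
              std::set<std::string> &seen,
              std::vector<std::string> &postOrder) {
  if (!seen.insert(bb).second)
    return;
  for (const auto &s : succs.at(bb))
    rpoVisit(s, succs, seen, postOrder);
  postOrder.push_back(bb);
}

int main() {
  // Hypothetical loop body: header branches to then/else, which join in latch.
  std::map<std::string, std::vector<std::string>> succs = {
      {"header", {"then", "else"}}, {"then", {"latch"}},
      {"else", {"latch"}},          {"latch", {}}};
  std::set<std::string> seen;
  std::vector<std::string> postOrder;
  rpoVisit("header", succs, seen, postOrder);
  std::reverse(postOrder.begin(), postOrder.end());
  for (const auto &bb : postOrder)
    std::printf("%s\n", bb.c_str()); // header, else, then, latch (a valid RPO)
}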
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) - continue; - SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { @@ -9052,11 +8808,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( } RecipeBuilder.setRecipe(Instr, Recipe); - if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && - HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the - // phi section of HeaderVPBB. - assert(isa<TruncInst>(Instr)); + if (isa<VPHeaderPHIRecipe>(Recipe)) { + // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In + // the following cases, VPHeaderPHIRecipes may be created after non-phi + // recipes and need to be moved to the phi section of HeaderVPBB: + // * tail-folding (non-phi recipes computing the header mask are + // introduced earlier than regular header phi recipes, and should appear + // after them) + // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. + + assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || + CM.foldTailByMasking() || isa<TruncInst>(Instr)) && + "unexpected recipe needs moving"); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); @@ -9074,7 +8837,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. } else - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -9088,8 +8851,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // --------------------------------------------------------------------------- // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a @@ -9150,21 +8912,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) - return std::nullopt; - - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); - VPlanTransforms::removeRedundantInductionCasts(*Plan); - - VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); - VPlanTransforms::removeDeadRecipes(*Plan); - - VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); - - VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); - VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); + return nullptr; - assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); - return std::make_optional(std::move(Plan)); + if (useActiveLaneMask(Style)) { + // TODO: Move checks to VPlanTransforms::addActiveLaneMask once + // TailFoldingStyle is visible there. 
+ bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); + bool WithoutRuntimeCheck = + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, + WithoutRuntimeCheck); + } + return Plan; } VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { @@ -9198,8 +8957,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); Term->eraseFromParent(); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + // Tail folding is not supported for outer loops, so the induction increment + // is guaranteed to not wrap. + bool HasNUW = true; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, + DebugLoc()); return Plan; } @@ -9211,105 +8973,211 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - for (const auto &Reduction : CM.getInLoopReductionChains()) { - PHINode *Phi = Reduction.first; - const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars().find(Phi)->second; - const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; - - if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) + VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); + // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores + // sank outside of the loop would keep the same order as they had in the + // original loop. + SmallVector<VPReductionPHIRecipe *> ReductionPHIList; + for (VPRecipeBase &R : Header->phis()) { + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + bool HasIntermediateStore = false; + stable_sort(ReductionPHIList, + [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + HasIntermediateStore |= IS1 || IS2; + + // If neither of the recipes has an intermediate store, keep the + // order the same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then + // move it towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe + // with the later store should be processed earlier. So it + // should go to the beginning of the list. 
+ return DT->dominates(IS2, IS1); + }); + + if (HasIntermediateStore && ReductionPHIList.size() > 1) + for (VPRecipeBase *R : ReductionPHIList) + R->moveBefore(*Header, Header->getFirstNonPhi()); + + SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis; + for (VPRecipeBase &R : Header->phis()) { + auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; + InLoopReductionPhis.push_back(PhiR); + } + + for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + "AnyOf reductions are not allowed for in-loop reductions"); + + // Collect the chain of "link" recipes for the reduction starting at PhiR. + SetVector<VPRecipeBase *> Worklist; + Worklist.insert(PhiR); + for (unsigned I = 0; I != Worklist.size(); ++I) { + VPRecipeBase *Cur = Worklist[I]; + for (VPUser *U : Cur->getVPSingleValue()->users()) { + auto *UserRecipe = dyn_cast<VPRecipeBase>(U); + if (!UserRecipe) + continue; + assert(UserRecipe->getNumDefinedValues() == 1 && + "recipes must define exactly one result value"); + Worklist.insert(UserRecipe); + } + } + + // Visit operation "Links" along the reduction chain top-down starting from + // the phi until LoopExitValue. We keep track of the previous item + // (PreviousLink) to tell which of the two operands of a Link will remain + // scalar and which will be reduced. For minmax by select(cmp), Link will be + // the select instructions. + VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0]. + for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) { + VPValue *PreviousLinkV = PreviousLink->getVPSingleValue(); + + Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); - // ReductionOperations are orders top-down from the phi's use to the - // LoopExitValue. We keep a track of the previous item (the Chain) to tell - // which of the two operands will remain scalar and which will be reduced. - // For minmax the chain will be the select instructions. - Instruction *Chain = Phi; - for (Instruction *R : ReductionOperations) { - VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); - RecurKind Kind = RdxDesc.getRecurrenceKind(); - - VPValue *ChainOp = Plan->getVPValue(Chain); - unsigned FirstOpId; - assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && - "Only min/max recurrences allowed for inloop reductions"); + // Index of the first operand which holds a non-mask vector operand. + unsigned IndexOfFirstOperand; // Recognize a call to the llvm.fmuladd intrinsic. 
bool IsFMulAdd = (Kind == RecurKind::FMulAdd); - assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && - "Expected instruction to be a call to the llvm.fmuladd intrinsic"); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - assert(isa<VPWidenSelectRecipe>(WidenRecipe) && - "Expected to replace a VPWidenSelectSC"); - FirstOpId = 1; + VPValue *VecOp; + VPBasicBlock *LinkVPBB = CurrentLink->getParent(); + if (IsFMulAdd) { + assert( + RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); + assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || + isa<VPWidenCallRecipe>(CurrentLink)) && + CurrentLink->getOperand(2) == PreviousLinkV && + "expected a call where the previous link is the added operand"); + + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe (multiplying the first two operands of + // the fmuladd together) to use as the vector operand for the fadd + // reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, + {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, + CurrentLinkI->getFastMathFlags()); + LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); + VecOp = FMulRecipe; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || - (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && - "Expected to replace a VPWidenSC"); - FirstOpId = 0; + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + if (isa<VPWidenRecipe>(CurrentLink)) { + assert(isa<CmpInst>(CurrentLinkI) && + "need to have the compare of the select"); + continue; + } + assert(isa<VPWidenSelectRecipe>(CurrentLink) && + "must be a select recipe"); + IndexOfFirstOperand = 1; + } else { + assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && + "Expected to replace a VPWidenSC"); + IndexOfFirstOperand = 0; + } + // Note that for non-commutable operands (cmp-selects), the semantics of + // the cmp-select are captured in the recurrence kind. + unsigned VecOpId = + CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV + ? IndexOfFirstOperand + 1 + : IndexOfFirstOperand; + VecOp = CurrentLink->getOperand(VecOpId); + assert(VecOp != PreviousLinkV && + CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - + (VecOpId - IndexOfFirstOperand)) == + PreviousLinkV && + "PreviousLinkV must be the operand other than VecOp"); } - unsigned VecOpId = - R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; - VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + BasicBlock *BB = CurrentLinkI->getParent(); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + if (CM.blockNeedsPredicationForAnyReason(BB)) { VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(WidenRecipe->getParent(), - WidenRecipe->getIterator()); - CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); + Builder.setInsertPoint(CurrentLink); + CondOp = RecipeBuilder.createBlockInMask(BB, *Plan); } - if (IsFMulAdd) { - // If the instruction is a call to the llvm.fmuladd intrinsic then we - // need to create an fmul recipe to use as the vector operand for the - // fadd reduction. 
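As the comment above explains, an in-loop fadd reduction fed by llvm.fmuladd is handled by splitting out the multiply and feeding its result to the reduction add, which is a legal unfused lowering of fmuladd. A scalar sketch of that rewrite with made-up inputs:

#include <cstdio>

int main() {
  float a[] = {1.5f, 2.0f, 0.5f};
  float b[] = {4.0f, 3.0f, 8.0f};

  float acc = 0.0f;
  for (int i = 0; i < 3; ++i) {
    // The scalar loop computes acc = fmuladd(a[i], b[i], acc). For the in-loop
    // reduction this becomes a separate multiply feeding the fadd reduction:
    float mul = a[i] * b[i]; // the new FMul recipe
    acc = acc + mul;         // the fadd reduction link
  }
  std::printf("%f\n", acc); // 16.000000
}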
- VPInstruction *FMulRecipe = new VPInstruction( - Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); - FMulRecipe->setFastMathFlags(R->getFastMathFlags()); - WidenRecipe->getParent()->insert(FMulRecipe, - WidenRecipe->getIterator()); - VecOp = FMulRecipe; - } - VPReductionRecipe *RedRecipe = - new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); - WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); - Plan->removeVPValueFor(R); - Plan->addVPValue(R, RedRecipe); + VPReductionRecipe *RedRecipe = new VPReductionRecipe( + RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. - WidenRecipe->getParent()->appendRecipe(RedRecipe); - WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); - WidenRecipe->eraseFromParent(); - - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - VPRecipeBase *CompareRecipe = - RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); - assert(isa<VPWidenRecipe>(CompareRecipe) && - "Expected to replace a VPWidenSC"); - assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && - "Expected no remaining users"); - CompareRecipe->eraseFromParent(); - } - Chain = R; + // Note that this transformation may leave over dead recipes (including + // CurrentLink), which will be cleaned by a later VPlan transform. + LinkVPBB->appendRecipe(RedRecipe); + CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe); + PreviousLink = RedRecipe; } } - - // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the beginning of the - // dedicated latch block. - if (CM.foldTailByMasking()) { - Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); + Builder.setInsertPoint(&*LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); - if (!PhiR || PhiR->isInLoop()) - continue; + VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); + if (!PhiR || PhiR->isInLoop()) + continue; + + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); + // If tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the beginning of the + // dedicated latch block. + if (CM.foldTailByMasking()) { VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); - Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); + FastMathFlags FMFs = RdxDesc.getFastMathFlags(); + Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); + Result = + PhiTy->isFloatingPointTy() + ? 
new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) + : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); + Result->insertBefore(&*Builder.getInsertPoint()); + Red->replaceUsesWithIf( + Result->getVPSingleValue(), + [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); }); + if (PreferPredicatedReductionSelect || + TTI.preferPredicatedReductionSelect( + PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, + TargetTransformInfo::ReductionFlags())) + PhiR->setOperand(1, Result->getVPSingleValue()); + } + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); + if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); + Type *RdxTy = RdxDesc.getRecurrenceType(); + auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, + Result->getVPSingleValue(), RdxTy); + auto *Extnd = + RdxDesc.isSigned() + ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) + : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); + + Trunc->insertAfter(Result); + Extnd->insertAfter(Trunc); + Result->getVPSingleValue()->replaceAllUsesWith(Extnd); + Trunc->setOperand(0, Result->getVPSingleValue()); } } @@ -9347,107 +9215,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Int or FP induction being replicated."); - - Value *Start = getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = getInductionDescriptor(); - TruncInst *Trunc = getTruncInst(); - IRBuilderBase &Builder = State.Builder; - assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - assert(State.VF.isVector() && "must have vector VF"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with fetching the step value. - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - Builder.SetInsertPoint(VectorPH->getTerminator()); - if (isa<TruncInst>(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast<IntegerType>(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = getStepVector( - SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. 
Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = ID.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa<Constant>(Mul) - ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. - PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", - &*State.CFG.PrevBB->getFirstInsertionPt()); - VecInd->setDebugLoc(EntryVal->getDebugLoc()); - Instruction *LastInduction = VecInd; - for (unsigned Part = 0; Part < State.UF; ++Part) { - State.set(this, LastInduction, Part); - - if (isa<TruncInst>(EntryVal)) - State.addMetadata(LastInduction, EntryVal); - - LastInduction = cast<Instruction>( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); - LastInduction->setDebugLoc(EntryVal->getDebugLoc()); - } - - LastInduction->setName("vec.ind.next"); - VecInd->addIncoming(SteppedStart, VectorPH); - // Add induction update using an incorrect block temporarily. The phi node - // will be fixed after VPlan execution. Note that at this point the latch - // block cannot be used, as it does not exist yet. - // TODO: Model increment value in VPlan, by turning the recipe into a - // multi-def and a subclass of VPHeaderPHIRecipe. - VecInd->addIncoming(LastInduction, VectorPH); -} - void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && "Not a pointer induction according to InductionDescriptor!"); @@ -9480,7 +9247,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( - State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); + State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, + IndDesc.getKind(), IndDesc.getInductionBinOp()); SclrGep->setName("next.gep"); State.set(this, SclrGep, VPIteration(Part, Lane)); } @@ -9547,41 +9315,26 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { // Fast-math-flags propagate from the original induction instruction. 
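For context on the VPWidenIntOrFpInductionRecipe::execute body being removed above: it builds the vector induction as a stepped start vector (start plus {0,1,...,VF-1}*step) and then advances each unrolled part by a splat of VF*step ("step.add"). A standalone sketch of the lane values this produces, assuming a fixed VF and UF (plain C++, not LLVM code):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
#include <vector>
int main() {
  const int Start = 5, Step = 3, VF = 4, UF = 2;
  std::vector<int> VecInd(VF);
  for (int L = 0; L < VF; ++L)
    VecInd[L] = Start + L * Step;        // stepped start vector ("vec.ind")
  for (int Part = 0; Part < UF; ++Part) {
    for (int L = 0; L < VF; ++L)
      std::printf("part %d lane %d: %d\n", Part, L, VecInd[L]);
    for (int L = 0; L < VF; ++L)
      VecInd[L] += VF * Step;            // add splat(VF * step) per part
  }
  return 0;
}
The last incremented vector plays the role of "vec.ind.next", the value fed back into the induction phi.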
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (IndDesc.getInductionBinOp() && - isa<FPMathOperator>(IndDesc.getInductionBinOp())) - State.Builder.setFastMathFlags( - IndDesc.getInductionBinOp()->getFastMathFlags()); + if (FPBinOp) + State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); - Value *DerivedIV = - emitTransformedIndex(State.Builder, CanonicalIV, - getStartValue()->getLiveInIRValue(), Step, IndDesc); + Value *DerivedIV = emitTransformedIndex( + State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, + Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName("offset.idx"); - if (ResultTy != DerivedIV->getType()) { - assert(Step->getType()->isIntegerTy() && + if (TruncResultTy) { + assert(TruncResultTy != DerivedIV->getType() && + Step->getType()->isIntegerTy() && "Truncation requires an integer step"); - DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); + DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); } assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); State.set(this, DerivedIV, VPIteration(0, 0)); } -void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (IndDesc.getInductionBinOp() && - isa<FPMathOperator>(IndDesc.getInductionBinOp())) - State.Builder.setFastMathFlags( - IndDesc.getInductionBinOp()->getFastMathFlags()); - - Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); - Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - - buildScalarSteps(BaseIV, Step, IndDesc, this, State); -} - void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), @@ -9592,48 +9345,51 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); Value *PrevInChain = State.get(getChainOp(), 0); - RecurKind Kind = RdxDesc->getRecurrenceKind(); - bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); + RecurKind Kind = RdxDesc.getRecurrenceKind(); + bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); // Propagate the fast-math flags carried by the underlying instruction. IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); + State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, Part); - VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); - Value *Iden = RdxDesc->getRecurrenceIdentity( - Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); - Value *IdenVec = - State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); + Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) + : State.get(Cond, {Part, 0}); + VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); + Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); + Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, + RdxDesc.getFastMathFlags()); + if (State.VF.isVector()) { + Iden = + State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); + } + + Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); NewVecOp = Select; } Value *NewRed; Value *NextInChain; if (IsOrdered) { if (State.VF.isVector()) - NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, + NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain); else NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); - NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); + NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = - createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), - NewRed, PrevInChain); + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); } else if (IsOrdered) NextInChain = NewRed; else NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, - PrevInChain); + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part); } } @@ -9652,7 +9408,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison, State.Instance->Part); } - State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); + State.packScalarIntoVectorValue(this, *State.Instance); } return; } @@ -9718,9 +9474,16 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); bool isMaskRequired = getMask(); - if (isMaskRequired) - for (unsigned Part = 0; Part < State.UF; ++Part) - BlockInMaskParts[Part] = State.get(getMask(), Part); + if (isMaskRequired) { + // Mask reversal is only neede for non-all-one (null) masks, as reverse of a + // null all-one mask is a null mask. + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Mask = State.get(getMask(), Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + BlockInMaskParts[Part] = Mask; + } + } const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. @@ -9731,7 +9494,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) - ? DL.getIndexType(ScalarDataTy->getPointerTo()) + ? DL.getIndexType(PointerType::getUnqual( + ScalarDataTy->getContext())) : Builder.getInt32Ty(); bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) @@ -9751,21 +9515,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); PartPtr = Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
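The conditional-reduction path in VPReductionRecipe::execute above neutralizes masked-off lanes by selecting the recurrence identity for them before reducing, so a single unconditional reduction still yields the masked result. A standalone sketch of that idea for an add reduction (plain C++, not LLVM code; identity 0 is specific to addition, other recurrence kinds use their own identity):
// --- editorial sketch, not part of the patch ---
#include <cassert>
int main() {
  const int Vec[4]   = {7, 9, 11, 13};
  const bool Mask[4] = {true, false, true, false};
  int Selected[4];
  for (int L = 0; L < 4; ++L)
    Selected[L] = Mask[L] ? Vec[L] : 0;  // select(cond, vecop, identity)
  int Sum = 0;
  for (int L = 0; L < 4; ++L)
    Sum += Selected[L];                  // unconditional reduction
  assert(Sum == 7 + 11);                 // only active lanes contribute
  return 0;
}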
- BlockInMaskParts[Part] = - Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + return PartPtr; }; // Handle Stores: if (SI) { - State.setDebugLocFromInst(SI); + State.setDebugLocFrom(SI->getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; @@ -9798,7 +9558,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // Handle loads. assert(LI && "Must have a load instruction"); - State.setDebugLocFromInst(LI); + State.setDebugLocFrom(LI->getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { @@ -9877,95 +9637,6 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( return CM_ScalarEpilogueAllowed; } -Value *VPTransformState::get(VPValue *Def, unsigned Part) { - // If Values have been set for this Def return the one relevant for \p Part. - if (hasVectorValue(Def, Part)) - return Data.PerPartOutput[Def][Part]; - - auto GetBroadcastInstrs = [this, Def](Value *V) { - bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); - if (VF.isScalar()) - return V; - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( - Plan->getVectorLoopRegion()->getSinglePredecessor())]; - if (LoopVectorPreHeader) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - } - - // Place the code for broadcasting invariant variables in the new preheader. - // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - - return Shuf; - }; - - if (!hasScalarValue(Def, {Part, 0})) { - Value *IRV = Def->getLiveInIRValue(); - Value *B = GetBroadcastInstrs(IRV); - set(Def, B, Part); - return B; - } - - Value *ScalarValue = get(Def, {Part, 0}); - // If we aren't vectorizing, we can just copy the scalar map values over - // to the vector map. - if (VF.isScalar()) { - set(Def, ScalarValue, Part); - return ScalarValue; - } - - bool IsUniform = vputils::isUniformAfterVectorization(Def); - - unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be uniform. - assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || - isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || - isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsUniform = true; - LastLane = 0; - } - - auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); - // Set the insert point after the last scalarized instruction or after the - // last PHI, if LastInst is a PHI. This ensures the insertelement sequence - // will directly follow the scalar definitions. - auto OldIP = Builder.saveIP(); - auto NewIP = - isa<PHINode>(LastInst) - ? 
BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) - : std::next(BasicBlock::iterator(LastInst)); - Builder.SetInsertPoint(&*NewIP); - - // However, if we are vectorizing, we need to construct the vector values. - // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero for each unroll - // iteration. Otherwise, we construct the vector values using - // insertelement instructions. Since the resulting vectors are stored in - // State, we will only generate the insertelements once. - Value *VectorValue = nullptr; - if (IsUniform) { - VectorValue = GetBroadcastInstrs(ScalarValue); - set(Def, VectorValue, Part); - } else { - // Initialize packing with insertelements to start from undef. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); - set(Def, Undef, Part); - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); - VectorValue = get(Def, Part); - } - Builder.restoreIP(OldIP); - return VectorValue; -} - // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -9994,7 +9665,8 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, + ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10013,8 +9685,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -10022,6 +9696,8 @@ static bool processLoopInVPlanNativePath( LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); } + reportVectorization(ORE, L, VF, 1); + // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); @@ -10076,7 +9752,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional<unsigned> VScale, Loop *L, - ScalarEvolution &SE) { + ScalarEvolution &SE, + ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) return false; @@ -10146,11 +9823,13 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC double MinTC2 = RtC * 10 / ScalarC; - // Now pick the larger minimum. If it is not a multiple of VF, choose the - // next closest multiple of VF. This should partly compensate for ignoring - // the epilogue cost. + // Now pick the larger minimum. 
If it is not a multiple of VF and a scalar + // epilogue is allowed, choose the next closest multiple of VF. This should + // partly compensate for ignoring the epilogue cost. uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); - VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + if (SEL == CM_ScalarEpilogueAllowed) + MinTC = alignTo(MinTC, IntVF); + VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); LLVM_DEBUG( dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" @@ -10270,7 +9949,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { else { if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + // Predicate tail-folded loops are efficient even when the loop + // iteration count is low. However, setting the epilogue policy to + // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops + // with runtime checks. It's more effective to let + // `areRuntimeChecksProfitable` determine if vectorization is beneficial + // for the loop. + if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } else { LLVM_DEBUG(dbgs() << " But the target considers the trip count too " "small to consider vectorizing.\n"); @@ -10334,7 +10020,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. @@ -10347,8 +10033,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. @@ -10365,7 +10053,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE())) { + *PSE.getSE(), SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10587,13 +10275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { DisableRuntimeUnroll = true; } // Report the vectorization decision. - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; - }); + reportVectorization(ORE, L, VF, IC); } if (ORE->allowExtraAnalysis(LV_NAME)) @@ -10676,8 +10358,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl( Changed |= CFGChanged |= processLoop(L); - if (Changed) + if (Changed) { LAIs->clear(); + +#ifndef NDEBUG + if (VerifySCEV) + SE->verify(); +#endif + } } // Process each loop nest in the function. 
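To make the trip-count bound above concrete: the second minimum requires the runtime-check cost to stay below one tenth of the total scalar work, RtC < ScalarC * TC / 10, i.e. TC > RtC * 10 / ScalarC; the larger of the two minima is then rounded up to a multiple of VF only when a scalar epilogue is allowed. A small standalone sketch of that final step (plain C++, not LLVM code; the first bound MinTC1 is taken as given and all costs are made-up values):
// --- editorial sketch, not part of the patch ---
#include <cmath>
#include <cstdint>
#include <cstdio>
int main() {
  const double RtC = 24.0, ScalarC = 4.0;    // runtime-check and scalar-body cost
  const double MinTC1 = 14.0;                // first bound, assumed given
  const double MinTC2 = RtC * 10 / ScalarC;  // 60: checks <= 10% of scalar work
  uint64_t MinTC = (uint64_t)std::ceil(std::max(MinTC1, MinTC2));
  const uint64_t IntVF = 8;
  const bool ScalarEpilogueAllowed = true;   // mirrors SEL == CM_ScalarEpilogueAllowed
  if (ScalarEpilogueAllowed)
    MinTC = (MinTC + IntVF - 1) / IntVF * IntVF; // alignTo(MinTC, IntVF) -> 64
  std::printf("minimum profitable trip count: %llu\n", (unsigned long long)MinTC);
  return 0;
}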
@@ -10725,10 +10413,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); PA.preserve<ScalarEvolutionAnalysis>(); - -#ifdef EXPENSIVE_CHECKS - SE.verify(); -#endif } if (Result.MadeCFGChange) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9870ffbb586c..9d799124074c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19,7 +19,6 @@ #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -34,6 +33,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" @@ -97,7 +97,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; using namespace llvm::PatternMatch; @@ -108,8 +107,9 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); -cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, - cl::desc("Run the SLP vectorization passes")); +static cl::opt<bool> + RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, @@ -140,10 +140,6 @@ static cl::opt<unsigned> MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)")); -static cl::opt<int> -MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, - cl::desc("Maximum depth of the lookup for consecutive stores.")); - /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -232,6 +228,17 @@ static bool isVectorLikeInstWithConstOps(Value *V) { return isConstant(I->getOperand(2)); } +#if !defined(NDEBUG) +/// Print a short descriptor of the instruction bundle suitable for debug output. +static std::string shortBundleName(ArrayRef<Value *> VL) { + std::string Result; + raw_string_ostream OS(Result); + OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; + OS.flush(); + return Result; +} +#endif + /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. 
static bool allSameBlock(ArrayRef<Value *> VL) { @@ -384,8 +391,10 @@ static SmallBitVector isUndefVector(const Value *V, if (isa<T>(II->getOperand(1))) continue; std::optional<unsigned> Idx = getInsertIndex(II); - if (!Idx) - continue; + if (!Idx) { + Res.reset(); + return Res; + } if (*Idx < UseMask.size() && !UseMask.test(*Idx)) Res.reset(*Idx); } @@ -429,26 +438,6 @@ static SmallBitVector isUndefVector(const Value *V, /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -539,117 +528,6 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) { return *EI->idx_begin(); } -/// Tries to find extractelement instructions with constant indices from fixed -/// vector type and gather such instructions into a bunch, which highly likely -/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was -/// successful, the matched scalars are replaced by poison values in \p VL for -/// future analysis. -static std::optional<TTI::ShuffleKind> -tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, - SmallVectorImpl<int> &Mask) { - // Scan list of gathered scalars for extractelements that can be represented - // as shuffles. - MapVector<Value *, SmallVector<int>> VectorOpToIdx; - SmallVector<int> UndefVectorExtracts; - for (int I = 0, E = VL.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI) { - if (isa<UndefValue>(VL[I])) - UndefVectorExtracts.push_back(I); - continue; - } - auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); - if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) - continue; - std::optional<unsigned> Idx = getExtractIndex(EI); - // Undefined index. - if (!Idx) { - UndefVectorExtracts.push_back(I); - continue; - } - SmallBitVector ExtractMask(VecTy->getNumElements(), true); - ExtractMask.reset(*Idx); - if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { - UndefVectorExtracts.push_back(I); - continue; - } - VectorOpToIdx[EI->getVectorOperand()].push_back(I); - } - // Sort the vector operands by the maximum number of uses in extractelements. 
- MapVector<unsigned, SmallVector<Value *>> VFToVector; - for (const auto &Data : VectorOpToIdx) - VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] - .push_back(Data.first); - for (auto &Data : VFToVector) { - stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { - return VectorOpToIdx.find(V1)->second.size() > - VectorOpToIdx.find(V2)->second.size(); - }); - } - // Find the best pair of the vectors with the same number of elements or a - // single vector. - const int UndefSz = UndefVectorExtracts.size(); - unsigned SingleMax = 0; - Value *SingleVec = nullptr; - unsigned PairMax = 0; - std::pair<Value *, Value *> PairVec(nullptr, nullptr); - for (auto &Data : VFToVector) { - Value *V1 = Data.second.front(); - if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { - SingleMax = VectorOpToIdx[V1].size() + UndefSz; - SingleVec = V1; - } - Value *V2 = nullptr; - if (Data.second.size() > 1) - V2 = *std::next(Data.second.begin()); - if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + - UndefSz) { - PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; - PairVec = std::make_pair(V1, V2); - } - } - if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) - return std::nullopt; - // Check if better to perform a shuffle of 2 vectors or just of a single - // vector. - SmallVector<Value *> SavedVL(VL.begin(), VL.end()); - SmallVector<Value *> GatheredExtracts( - VL.size(), PoisonValue::get(VL.front()->getType())); - if (SingleMax >= PairMax && SingleMax) { - for (int Idx : VectorOpToIdx[SingleVec]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } else { - for (Value *V : {PairVec.first, PairVec.second}) - for (int Idx : VectorOpToIdx[V]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } - // Add extracts from undefs too. - for (int Idx : UndefVectorExtracts) - std::swap(GatheredExtracts[Idx], VL[Idx]); - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - std::optional<TTI::ShuffleKind> Res = - isFixedVectorShuffle(GatheredExtracts, Mask); - if (!Res) { - // TODO: try to check other subsets if possible. - // Restore the original VL if attempt was not successful. - VL.swap(SavedVL); - return std::nullopt; - } - // Restore unused scalars from mask, if some of the extractelements were not - // selected for shuffle. - for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { - auto *EI = dyn_cast<ExtractElementInst>(VL[I]); - if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || - !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || - is_contained(UndefVectorExtracts, I)) - continue; - if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I])) - std::swap(VL[I], GatheredExtracts[I]); - } - return Res; -} - namespace { /// Main data required for vectorization of instructions. @@ -695,7 +573,7 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } -/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. /// /// Example of unsupported opcode is SDIV that can potentially cause UB if the @@ -889,18 +767,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, /// \returns true if all of the values in \p VL have the same type or false /// otherwise. 
static bool allSameType(ArrayRef<Value *> VL) { - Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + Type *Ty = VL.front()->getType(); + return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; }); } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. -static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, - TargetLibraryInfo *TLI) { +static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, + TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { @@ -914,11 +788,10 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) - return (CI->getArgOperand(i) == Scalar); - } - [[fallthrough]]; + return any_of(enumerate(CI->args()), [&](auto &&Arg) { + return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) && + Arg.value().get() == Scalar; + }); } default: return false; @@ -1181,6 +1054,7 @@ public: void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); + MultiNodeScalars.clear(); MustGather.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); @@ -1273,7 +1147,7 @@ public: /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. - unsigned canMapToVector(Type *T, const DataLayout &DL) const; + unsigned canMapToVector(Type *T) const; /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. @@ -1324,6 +1198,9 @@ public: } LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } #endif + bool operator == (const EdgeInfo &Other) const { + return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; + } }; /// A helper class used for scoring candidates for two consecutive lanes. @@ -1764,7 +1641,7 @@ public: auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) return 0; - return R.areAllUsersVectorized(IdxLaneI, std::nullopt) + return R.areAllUsersVectorized(IdxLaneI) ? LookAheadHeuristics::ScoreAllUserVectorized : 0; } @@ -1941,7 +1818,7 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - auto It = HashMap.find(NumFreeOpsHash.Hash); + auto *It = HashMap.find(NumFreeOpsHash.Hash); if (It == HashMap.end()) HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); else @@ -2203,7 +2080,7 @@ public: for (int Pass = 0; Pass != 2; ++Pass) { // Check if no need to reorder operands since they're are perfect or // shuffled diamond match. - // Need to to do it to avoid extra external use cost counting for + // Need to do it to avoid extra external use cost counting for // shuffled matches, which may cause regressions. if (SkipReordering()) break; @@ -2388,6 +2265,18 @@ public: ~BoUpSLP(); private: + /// Determine if a vectorized value \p V in can be demoted to + /// a smaller type with a truncation. We collect the values that will be + /// demoted in ToDemote and additional roots that require investigating in + /// Roots. 
+ /// \param DemotedConsts list of Instruction/OperandIndex pairs that are + /// constant and to be demoted. Required to correctly identify constant nodes + /// to be demoted. + bool collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const; + /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one /// user and reordarable). @@ -2410,12 +2299,25 @@ private: TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { ArrayRef<Value *> VL = UserTE->getOperand(OpIdx); TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { + const auto *It = find_if(VL, [&](Value *V) { TE = getTreeEntry(V); - return TE; + if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) + return true; + auto It = MultiNodeScalars.find(V); + if (It != MultiNodeScalars.end()) { + for (TreeEntry *E : It->second) { + if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) { + TE = E; + return true; + } + } + } + return false; }); - if (It != VL.end() && TE->isSame(VL)) + if (It != VL.end()) { + assert(TE->isSame(VL) && "Expected same scalars."); return TE; + } return nullptr; } @@ -2428,13 +2330,16 @@ private: } /// Checks if all users of \p I are the part of the vectorization tree. - bool areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const; + bool areAllUsersVectorized( + Instruction *I, + const SmallDenseSet<Value *> *VectorizedVals = nullptr) const; /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. - TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx); + TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops); + + /// \ returns the graph entry for the \p Idx operand of the \p E entry. + const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, @@ -2450,15 +2355,22 @@ private: /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. + /// \param ResizeAllowed indicates whether it is allowed to handle subvector + /// extract order. bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const; + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed = false) const; /// Vectorize a single entry in the tree. - Value *vectorizeTree(TreeEntry *E); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. - Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// \param PostponedPHIs true, if need to postpone emission of phi nodes to + /// avoid issues with def-use order. + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts @@ -2477,17 +2389,50 @@ private: /// instruction in the list). 
Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + std::optional<TargetTransformInfo::ShuffleKind> + tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, + SmallVectorImpl<int> &Mask) const; + + /// Tries to find extractelement instructions with constant indices from fixed + /// vector type and gather such instructions into a bunch, which highly likely + /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt + /// was successful, the matched scalars are replaced by poison values in \p VL + /// for future analysis. + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const; + + /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional<TargetTransformInfo::ShuffleKind> - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -2517,14 +2462,14 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. - DenseMap<Value *, SmallVector<StoreInst *, 4>> + DenseMap<Value *, SmallVector<StoreInst *>> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the - /// stores in \p StoresVec can form a vector instruction. 
If so it returns true - /// and populates \p ReorderIndices with the shuffle indices of the the stores - /// when compared to the sorted vector. - bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, + /// stores in \p StoresVec can form a vector instruction. If so it returns + /// true and populates \p ReorderIndices with the shuffle indices of the + /// stores when compared to the sorted vector. + bool canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const; /// Iterates through the users of \p TE, looking for scalar stores that can be @@ -2621,10 +2566,18 @@ private: /// The Scalars are vectorized into this value. It is initialized to Null. WeakTrackingVH VectorizedValue = nullptr; + /// New vector phi instructions emitted for the vectorized phi nodes. + PHINode *PHI = nullptr; + /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + enum EntryState { + Vectorize, + ScatterVectorize, + PossibleStridedVectorize, + NeedToGather + }; EntryState State; /// Does this sequence require some shuffling? @@ -2772,6 +2725,14 @@ private: return FoundLane; } + /// Build a shuffle mask for graph entry which represents a merge of main + /// and alternate operations. + void + buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars = nullptr, + SmallVectorImpl<Value *> *AltScalars = nullptr) const; + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -2792,6 +2753,9 @@ private: case ScatterVectorize: dbgs() << "ScatterVectorize\n"; break; + case PossibleStridedVectorize: + dbgs() << "PossibleStridedVectorize\n"; + break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -2892,7 +2856,14 @@ private: } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); + const TreeEntry *TE = getTreeEntry(V); + assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && + "Scalar already in tree!"); + if (TE) { + if (TE != Last) + MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last); + continue; + } ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. @@ -2905,7 +2876,8 @@ private: for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; - assert(BundleMember && "Unexpected end of bundle."); + if (!BundleMember) + continue; BundleMember->TE = Last; BundleMember = BundleMember->NextInBundle; } @@ -2913,6 +2885,10 @@ private: assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); + // Build a map for gathered scalars to the nodes where they are used. + for (Value *V : VL) + if (!isConstant(V)) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (UserTreeIdx.UserTE) @@ -2950,6 +2926,10 @@ private: /// Maps a specific scalar to its tree entry. SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; + /// List of scalars, used in several vectorize nodes, and the list of the + /// nodes. + SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars; + /// Maps a value to the proposed vectorizable size. SmallDenseMap<Value *, unsigned> InstrElementSize; @@ -2995,25 +2975,25 @@ private: /// is invariant in the calling loop. 
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { + if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2)) + return true; // First check if the result is already in the cache. - AliasCacheKey key = std::make_pair(Inst1, Inst2); - std::optional<bool> &result = AliasCache[key]; - if (result) { - return *result; - } - bool aliased = true; - if (Loc1.Ptr && isSimple(Inst1)) - aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); + AliasCacheKey Key = std::make_pair(Inst1, Inst2); + auto It = AliasCache.find(Key); + if (It != AliasCache.end()) + return It->second; + bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. - result = aliased; - return aliased; + AliasCache.try_emplace(Key, Aliased); + AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); + return Aliased; } using AliasCacheKey = std::pair<Instruction *, Instruction *>; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap<AliasCacheKey, std::optional<bool>> AliasCache; + DenseMap<AliasCacheKey, bool> AliasCache; // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which @@ -3047,7 +3027,7 @@ private: SetVector<Instruction *> GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> CSEBlocks; + DenseSet<BasicBlock *> CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an @@ -3497,7 +3477,7 @@ private: BasicBlock *BB; /// Simple memory allocation for ScheduleData. - std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; + SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. int ChunkSize; @@ -3607,7 +3587,7 @@ private: /// where "width" indicates the minimum bit width and "signed" is True if the /// value must be signed-extended, rather than zero-extended, back to its /// original width. 
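The isAliased change above bails out early for queries it cannot analyze and stores the computed answer under both key orders, so the symmetric query (Inst2, Inst1) is also a cache hit. A standalone sketch of that symmetric memoization pattern (plain C++, with std::map standing in for the DenseMap and a fake expensive query):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
#include <map>
#include <utility>
using Key = std::pair<int, int>;
static std::map<Key, bool> Cache;
static int ExpensiveCalls = 0;
static bool expensiveQuery(int A, int B) { ++ExpensiveCalls; return (A + B) % 2 == 0; }
static bool cachedQuery(int A, int B) {
  auto It = Cache.find({A, B});
  if (It != Cache.end())
    return It->second;   // hit regardless of which order was inserted
  bool R = expensiveQuery(A, B);
  Cache[{A, B}] = R;     // store the result...
  Cache[{B, A}] = R;     // ...and its mirror
  return R;
}
int main() {
  cachedQuery(1, 3);
  cachedQuery(3, 1);                                     // served from the mirror
  std::printf("expensive calls: %d\n", ExpensiveCalls);  // prints 1
  return 0;
}
Caching the mirrored key trades a little memory for avoiding a second alias query on the swapped pair, which the old code would have recomputed.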
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; + DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; }; } // end namespace slpvectorizer @@ -3676,7 +3656,7 @@ template <> struct GraphTraits<BoUpSLP *> { template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { using TreeEntry = BoUpSLP::TreeEntry; - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; @@ -3699,7 +3679,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; - if (Entry->State == TreeEntry::ScatterVectorize) + if (Entry->State == TreeEntry::ScatterVectorize || + Entry->State == TreeEntry::PossibleStridedVectorize) return "color=blue"; return ""; } @@ -3761,7 +3742,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) { inversePermutation(Order, MaskOrder); } reorderReuses(MaskOrder, Mask); - if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) { Order.clear(); return; } @@ -3779,7 +3760,40 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { OrdersType CurrentOrder(NumScalars, NumScalars); SmallVector<int> Positions; SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; + DenseMap<const TreeEntry *, unsigned> UsedEntries; + DenseMap<Value *, std::pair<const TreeEntry *, unsigned>> ValueToEntryPos; + for (Value *V : TE.Scalars) { + if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) + continue; + const auto *LocalSTE = getTreeEntry(V); + if (!LocalSTE) + continue; + unsigned Lane = + std::distance(LocalSTE->Scalars.begin(), find(LocalSTE->Scalars, V)); + if (Lane >= NumScalars) + continue; + ++UsedEntries.try_emplace(LocalSTE, 0).first->getSecond(); + ValueToEntryPos.try_emplace(V, LocalSTE, Lane); + } + if (UsedEntries.empty()) + return std::nullopt; + const TreeEntry &BestSTE = + *std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; + UsedEntries.erase(&BestSTE); + const TreeEntry *SecondBestSTE = nullptr; + if (!UsedEntries.empty()) + SecondBestSTE = + std::max_element(UsedEntries.begin(), UsedEntries.end(), + [](const std::pair<const TreeEntry *, unsigned> &P1, + const std::pair<const TreeEntry *, unsigned> &P2) { + return P1.second < P2.second; + }) + ->first; // Try to find all gathered scalars that are gets vectorized in other // vectorize node. Here we can have only one single tree vector node to // correctly identify order of the gathered scalars. @@ -3787,58 +3801,56 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { Value *V = TE.Scalars[I]; if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V)) continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. 
- return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) - continue; - UsedPositions.reset(CurrentOrder[Lane]); - } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); + const auto [LocalSTE, Lane] = ValueToEntryPos.lookup(V); + if (!LocalSTE || (LocalSTE != &BestSTE && LocalSTE != SecondBestSTE)) + continue; + if (CurrentOrder[Lane] != NumScalars) { + if ((CurrentOrder[Lane] >= BestSTE.Scalars.size() || + BestSTE.Scalars[CurrentOrder[Lane]] == V) && + (Lane != I || LocalSTE == SecondBestSTE)) + continue; + UsedPositions.reset(CurrentOrder[Lane]); } + // The partial identity (where only some elements of the gather node are + // in the identity order) is good. + CurrentOrder[Lane] = I; + UsedPositions.set(I); } // Need to keep the order if we have a vector entry and at least 2 scalars or // the vectorized entry has just 2 scalars. - if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; - continue; - } - if (*It == NumScalars) { - *It = I; - ++I; - } - ++It; + if (BestSTE.Scalars.size() != 2 && UsedPositions.count() <= 1) + return std::nullopt; + auto IsIdentityOrder = [&](ArrayRef<unsigned> CurrentOrder) { + for (unsigned I = 0; I < NumScalars; ++I) + if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) + return false; + return true; + }; + if (IsIdentityOrder(CurrentOrder)) + return OrdersType(); + auto *It = CurrentOrder.begin(); + for (unsigned I = 0; I < NumScalars;) { + if (UsedPositions.test(I)) { + ++I; + continue; } - return std::move(CurrentOrder); + if (*It == NumScalars) { + *It = I; + ++I; + } + ++It; } - return std::nullopt; + return std::move(CurrentOrder); } namespace { /// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +enum class LoadsState { + Gather, + Vectorize, + ScatterVectorize, + PossibleStridedVectorize +}; } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, @@ -3898,6 +3910,7 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { + bool IsPossibleStrided = false; if (IsSorted) { Value *Ptr0; Value *PtrN; @@ -3913,6 +3926,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast<unsigned>(*Diff) == VL.size() - 1) return LoadsState::Vectorize; + // Simple check if not a strided access - clear order. 
+ IsPossibleStrided = *Diff % (VL.size() - 1) == 0; } // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -3934,7 +3949,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::ScatterVectorize; + return IsPossibleStrided ? LoadsState::PossibleStridedVectorize + : LoadsState::ScatterVectorize; } } @@ -4050,7 +4066,8 @@ static bool areTwoInsertFromSameBuildVector( // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. - SmallSet<int, 8> ReusedIdx; + SmallBitVector ReusedIdx( + cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue()); bool IsReusedIdx = false; do { if (IE2 == VU && !IE1) @@ -4058,16 +4075,18 @@ static bool areTwoInsertFromSameBuildVector( if (IE1 == V && !IE2) return V->hasOneUse(); if (IE1 && IE1 != V) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second; + unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2); + IsReusedIdx |= ReusedIdx.test(Idx1); + ReusedIdx.set(Idx1); if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) IE1 = nullptr; else IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); } if (IE2 && IE2 != VU) { - IsReusedIdx |= - !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second; + unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1); + IsReusedIdx |= ReusedIdx.test(Idx2); + ReusedIdx.set(Idx2); if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) IE2 = nullptr; else @@ -4135,13 +4154,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. 
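The load-classification change above distinguishes its cases from the element distance Diff between the first and last sorted pointer: a distance of VL.size()-1 means the loads are consecutive, a multiple of VL.size()-1 hints at a constant stride, and anything else falls back to a scatter/gather (or plain gather if masked gathers are not legal). A standalone sketch of that classification (plain C++, not LLVM code; TTI legality is reduced to a single flag):
// --- editorial sketch, not part of the patch ---
#include <cstdio>
enum class LoadsState { Gather, Vectorize, ScatterVectorize, PossibleStridedVectorize };
static LoadsState classify(long Diff, unsigned NumLoads, bool MaskedGatherLegal) {
  if (Diff == (long)NumLoads - 1)
    return LoadsState::Vectorize;                       // consecutive elements
  bool IsPossibleStrided = Diff % ((long)NumLoads - 1) == 0;
  if (!MaskedGatherLegal)
    return LoadsState::Gather;
  return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
                           : LoadsState::ScatterVectorize;
}
int main() {
  std::printf("%d\n", (int)classify(3, 4, true)); // 1: stride-1, consecutive
  std::printf("%d\n", (int)classify(9, 4, true)); // 3: stride-3 candidate
  std::printf("%d\n", (int)classify(7, 4, true)); // 2: irregular -> scatter
  return 0;
}
For example, four loads at elements 0, 3, 6, 9 give Diff == 9, which is divisible by 3, so they are flagged as a possible strided access rather than a plain scatter.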
return std::move(ResOrder); } - if (TE.State == TreeEntry::Vectorize && + if ((TE.State == TreeEntry::Vectorize || + TE.State == TreeEntry::PossibleStridedVectorize) && (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { - auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + auto PHICompare = [&](unsigned I1, unsigned I2) { + Value *V1 = TE.Scalars[I1]; + Value *V2 = TE.Scalars[I2]; if (V1 == V2) return false; if (!V1->hasOneUse() || !V2->hasOneUse()) @@ -4180,14 +4202,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { }; if (!TE.ReorderIndices.empty()) return TE.ReorderIndices; - DenseMap<Value *, unsigned> PhiToId; - SmallVector<Value *, 4> Phis; + DenseMap<unsigned, unsigned> PhiToId; + SmallVector<unsigned> Phis(TE.Scalars.size()); + std::iota(Phis.begin(), Phis.end(), 0); OrdersType ResOrder(TE.Scalars.size()); - for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { - PhiToId[TE.Scalars[Id]] = Id; - Phis.push_back(TE.Scalars[Id]); - } - llvm::stable_sort(Phis, PHICompare); + for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) + PhiToId[Id] = Id; + stable_sort(Phis, PHICompare); for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) ResOrder[Id] = PhiToId[Phis[Id]]; if (IsIdentityOrder(ResOrder)) @@ -4214,7 +4235,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, + /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) { if (!CurrentOrder.empty()) fixupOrderingIndices(CurrentOrder); @@ -4270,7 +4292,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, unsigned Sz) { ArrayRef<int> FirstCluster = Mask.slice(0, Sz); - if (ShuffleVectorInst::isIdentityMask(FirstCluster)) + if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz)) return false; for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { ArrayRef<int> Cluster = Mask.slice(I, Sz); @@ -4386,7 +4408,9 @@ void BoUpSLP::reorderTopToBottom() { ++Cnt; } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && TE->getOpcode() == Instruction::PHI) @@ -4409,6 +4433,9 @@ void BoUpSLP::reorderTopToBottom() { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<OrdersType> StridedVectorizeOrders; SmallPtrSet<const TreeEntry *, 4> VisitedOps; for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, @@ -4455,6 +4482,11 @@ void BoUpSLP::reorderTopToBottom() { if (Order.empty()) continue; } + // Postpone scatter orders. 
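// The PHI reordering shown earlier in this hunk now sorts lane indices rather
// than the scalar values themselves: indices are seeded with std::iota and
// permuted with a stable sort whose comparator looks the lanes up in the
// scalar list. A minimal standalone sketch of that argsort pattern follows
// (illustrative names and element type, not the LLVM code).
#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

std::vector<unsigned> sortedLaneOrder(const std::vector<std::string> &Scalars) {
  std::vector<unsigned> Lanes(Scalars.size());
  std::iota(Lanes.begin(), Lanes.end(), 0u); // 0, 1, 2, ...
  // A stable sort keeps the original relative order of equal lanes, which
  // makes the resulting permutation deterministic.
  std::stable_sort(Lanes.begin(), Lanes.end(), [&](unsigned I1, unsigned I2) {
    return Scalars[I1] < Scalars[I2];
  });
  return Lanes;
}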
+ if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.push_back(Order); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4472,8 +4504,21 @@ void BoUpSLP::reorderTopToBottom() { } } // Set order of the user node. - if (OrdersUses.empty()) - continue; + if (OrdersUses.empty()) { + if (StridedVectorizeOrders.empty()) + continue; + // Add (potentially!) strided vectorize orders. + for (OrdersType &Order : StridedVectorizeOrders) + ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. + for (OrdersType &Order : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Order); + if (It != OrdersUses.end()) + ++It->second; + } + } // Choose the most used order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; @@ -4514,7 +4559,8 @@ void BoUpSLP::reorderTopToBottom() { } continue; } - if (TE->State == TreeEntry::Vectorize && + if ((TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) && isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, InsertElementInst>(TE->getMainOp()) && !TE->isAltShuffle()) { @@ -4555,6 +4601,10 @@ bool BoUpSLP::canReorderOperands( })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { + // FIXME: Do not reorder (possible!) strided vectorized nodes, they + // require reordering of the operands, which is not implemented yet. + if (TE->State == TreeEntry::PossibleStridedVectorize) + return false; // Do not reorder if operand node is used by many user nodes. if (any_of(TE->UserTreeIndices, [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) @@ -4567,7 +4617,8 @@ bool BoUpSLP::canReorderOperands( // simply add to the list of gathered ops. // If there are reused scalars, process this node as a regular vectorize // node, just reorder reuses mask. - if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty()) + if (TE->State != TreeEntry::Vectorize && + TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); continue; } @@ -4602,18 +4653,19 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. SmallVector<TreeEntry *> NonVectorized; - for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( - const std::unique_ptr<TreeEntry> &TE) { - if (TE->State != TreeEntry::Vectorize) + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize) NonVectorized.push_back(TE.get()); if (std::optional<OrdersType> CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) + if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize) || + !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } - }); + } // 1. Propagate order to the graph nodes, which use only reordered nodes. 
// I.e., if the node has operands, that are reordered, try to make at least @@ -4627,6 +4679,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || + TE->State == TreeEntry::PossibleStridedVectorize || (TE->State == TreeEntry::NeedToGather && GathersToOrders.count(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || @@ -4649,8 +4702,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } } // Erase filtered entries. - for_each(Filtered, - [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (TreeEntry *TE : Filtered) + OrderedEntries.remove(TE); SmallVector< std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> UsersVec(Users.begin(), Users.end()); @@ -4662,10 +4715,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector<TreeEntry *> GatherOps; if (!canReorderOperands(Data.first, Data.second, NonVectorized, GatherOps)) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // All operands are reordered and used only in this node - propagate the @@ -4673,6 +4724,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { MapVector<OrdersType, unsigned, DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> OrdersUses; + // Last chance orders - scatter vectorize. Try to use their orders if no + // other orders or the order is counted already. + SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders; // Do the analysis for each tree entry only once, otherwise the order of // the same node my be considered several times, though might be not // profitable. @@ -4694,6 +4748,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { return P.second == OpTE; }); + // Postpone scatter orders. + if (OpTE->State == TreeEntry::PossibleStridedVectorize) { + StridedVectorizeOrders.emplace_back(Order, NumOps); + continue; + } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4754,11 +4813,27 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // If no orders - skip current nodes and jump to the next one, if any. if (OrdersUses.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); - continue; + if (StridedVectorizeOrders.empty() || + (Data.first->ReorderIndices.empty() && + Data.first->ReuseShuffleIndices.empty() && + !(IgnoreReorder && + Data.first == VectorizableTree.front().get()))) { + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + // Add (potentially!) strided vectorize orders. + for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) + OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second += + Pair.second; + } else { + // Account (potentially!) strided vectorize orders only if it was used + // already. 
+ for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) { + auto *It = OrdersUses.find(Pair.first); + if (It != OrdersUses.end()) + It->second += Pair.second; + } } // Choose the best order. ArrayRef<unsigned> BestOrder = OrdersUses.front().first; @@ -4771,10 +4846,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Set order of the user node (reordering of operands and user nodes). if (BestOrder.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) { - OrderedEntries.remove(Op.second); - }); + for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) + OrderedEntries.remove(Op.second); continue; } // Erase operands from OrderedEntries list and adjust their orders. @@ -4796,7 +4869,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; } // Gathers are processed separately. - if (TE->State != TreeEntry::Vectorize) + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::PossibleStridedVectorize && + (TE->State != TreeEntry::ScatterVectorize || + TE->ReorderIndices.empty())) continue; assert((BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && @@ -4825,7 +4901,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->isAltShuffle()) Data.first->reorderOperands(Mask); if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || - Data.first->isAltShuffle()) { + Data.first->isAltShuffle() || + Data.first->State == TreeEntry::PossibleStridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder); if (Data.first->ReuseShuffleIndices.empty() && @@ -4859,10 +4936,12 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!isa<Instruction>(Scalar)) + continue; int FoundLane = Entry->findLaneForValue(Scalar); // Check if the scalar is externally used as an extra arg. - auto ExtI = ExternallyUsedValues.find(Scalar); + const auto *ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << Lane << " from " << *Scalar << ".\n"); @@ -4886,7 +4965,8 @@ void BoUpSLP::buildExternalUses( // be used. if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + UseEntry->State == TreeEntry::PossibleStridedVectorize || + !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); @@ -4906,9 +4986,9 @@ void BoUpSLP::buildExternalUses( } } -DenseMap<Value *, SmallVector<StoreInst *, 4>> +DenseMap<Value *, SmallVector<StoreInst *>> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap; + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap; for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // To save compilation time we don't visit if we have too many users. 
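// The two branches above implement a simple two-stage vote: orders coming
// from regular nodes are tallied first, and orders coming from
// possibly-strided nodes are only allowed to reinforce an order that is
// already in the tally (or to seed the tally when it would otherwise be
// empty). A standalone sketch of that merging rule, using std::map in place
// of the LLVM map types (illustrative only):
#include <map>
#include <utility>
#include <vector>

using Order = std::vector<unsigned>;

void mergeStridedOrderVotes(
    std::map<Order, unsigned> &Tally,
    const std::vector<std::pair<Order, unsigned>> &Strided) {
  if (Tally.empty()) {
    // No regular votes at all: let the postponed strided orders seed the tally.
    for (const auto &[O, Weight] : Strided)
      Tally[O] += Weight;
    return;
  }
  // Otherwise only reinforce orders that regular nodes already voted for.
  for (const auto &[O, Weight] : Strided) {
    auto It = Tally.find(O);
    if (It != Tally.end())
      It->second += Weight;
  }
}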
@@ -4947,14 +5027,14 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { return PtrToStoresMap; } -bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec, +bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, OrdersType &ReorderIndices) const { // We check whether the stores in StoreVec can form a vector by sorting them // and checking whether they are consecutive. // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size()); + SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size()); StoreInst *S0 = StoresVec[0]; StoreOffsetVec[0] = {S0, 0}; Type *S0Ty = S0->getValueOperand()->getType(); @@ -5023,7 +5103,7 @@ SmallVector<BoUpSLP::OrdersType, 1> BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap = + DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap = collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of @@ -5244,6 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: return TreeEntry::ScatterVectorize; + case LoadsState::PossibleStridedVectorize: + return TreeEntry::PossibleStridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -5416,7 +5498,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); VFShape Shape = VFShape::get( - *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), + CI->getFunctionType(), + ElementCount::getFixed(static_cast<unsigned int>(VL.size())), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); @@ -5488,9 +5571,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, SmallVector<int> ReuseShuffleIndicies; SmallVector<Value *> UniqueValues; - auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues, - &UserTreeIdx, - this](const InstructionsState &S) { + SmallVector<Value *> NonUniqueValueVL; + auto TryToFindDuplicates = [&](const InstructionsState &S, + bool DoNotFail = false) { // Check that every instruction appears once in this bundle. 
DenseMap<Value *, unsigned> UniquePositions(VL.size()); for (Value *V : VL) { @@ -5517,6 +5600,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, !isConstant(V); })) || !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) { + if (DoNotFail && UniquePositions.size() > 1 && + NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && + all_of(UniqueValues, [=](Value *V) { + return isa<ExtractElementInst>(V) || + areAllUsersVectorized(cast<Instruction>(V), + UserIgnoreList); + })) { + unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); + if (PWSz == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); + NonUniqueValueVL.append(PWSz - UniqueValues.size(), + UniqueValues.back()); + VL = NonUniqueValueVL; + } + return true; + } LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; @@ -5528,6 +5629,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL, *TLI); + // Don't vectorize ephemeral values. + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return; + } + } + } + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without // ballooning over-budget. @@ -5633,7 +5746,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BasicBlock *BB = nullptr; bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize); bool AreAllSameInsts = (S.getOpcode() && allSameBlock(VL)) || (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE && @@ -5665,39 +5779,44 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Don't vectorize ephemeral values. - if (!EphValues.empty()) { - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return; - } - } - } - // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else + E = nullptr; + } else { + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + } else { + // Record the reuse of the tree node. 
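// TryToFindDuplicates above records, for every lane, which unique scalar it
// maps to, and on the DoNotFail path pads the unique list up to the next
// power of two by repeating the last value so the bundle keeps a
// power-of-two width. A minimal standalone sketch of that shaping step,
// written over plain ints (illustrative names, not the LLVM code):
#include <cstddef>
#include <unordered_map>
#include <vector>

struct DedupResult {
  std::vector<int> Unique;     // distinct values, padded to a power-of-two size
  std::vector<unsigned> Reuse; // for each original lane, index into Unique
};

DedupResult deduplicateBundle(const std::vector<int> &Bundle) {
  DedupResult Res;
  std::unordered_map<int, unsigned> Pos;
  for (int V : Bundle) {
    auto [It, Inserted] =
        Pos.try_emplace(V, static_cast<unsigned>(Res.Unique.size()));
    if (Inserted)
      Res.Unique.push_back(V);
    Res.Reuse.push_back(It->second);
  }
  // Pad to the next power of two by repeating the last unique value; the
  // padded lanes are simply shuffled out later.
  std::size_t PowerOfTwo = 1;
  while (PowerOfTwo < Res.Unique.size())
    PowerOfTwo <<= 1;
  while (Res.Unique.size() < PowerOfTwo)
    Res.Unique.push_back(Res.Unique.back());
  return Res;
}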
FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); return; } - // Record the reuse of the tree node. FIXME, currently this is only used to - // properly draw the graph rather than for the actual vectorization. - E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); - return; } // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { - if (!IsScatterVectorizeUserTE && !isa<Instruction>(V)) + if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) || + doesNotNeedToBeScheduled(V)) continue; if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -5725,7 +5844,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. if (AreAllSameInsts && UserTreeIdx.UserTE && - UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && + (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || + UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) && !(S.getOpcode() && allSameBlock(VL))) { assert(S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= @@ -5760,7 +5880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check that every instruction appears once in this bundle. - if (!TryToFindDuplicates(S)) + if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; // Perform specific checks for each particular instruction kind. @@ -5780,7 +5900,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BlockScheduling &BS = *BSRef; - std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + std::optional<ScheduleData *> Bundle = + BS.tryScheduleBundle(UniqueValues, this, S); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); @@ -5905,6 +6026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. TreeEntry *TE = nullptr; + fixupOrderingIndices(CurrentOrder); switch (State) { case TreeEntry::Vectorize: if (CurrentOrder.empty()) { @@ -5913,7 +6035,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - fixupOrderingIndices(CurrentOrder); // Need to reorder. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); @@ -5921,6 +6042,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); break; + case TreeEntry::PossibleStridedVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + if (CurrentOrder.empty()) { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + } else { + TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + } + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. 
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, @@ -5951,13 +6085,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6031,13 +6165,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6087,8 +6221,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (!CI) Operands.back().push_back(Op); else - Operands.back().push_back(ConstantExpr::getIntegerCast( - CI, Ty, CI->getValue().isSignBitSet())); + Operands.back().push_back(ConstantFoldIntegerCast( + CI, Ty, CI->getValue().isSignBitSet(), *DL)); } TE->setOperand(IndexIdx, Operands.back()); @@ -6132,18 +6266,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); - for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - // For scalar operands no need to to create an entry since no need to + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { + // For scalar operands no need to create an entry since no need to // vectorize it. - if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) continue; ValueList Operands; // Prepare the operand vector. for (Value *V : VL) { auto *CI2 = cast<CallInst>(V); - Operands.push_back(CI2->getArgOperand(i)); + Operands.push_back(CI2->getArgOperand(I)); } - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6194,13 +6328,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) { ValueList Operands; // Prepare the operand vector. 
for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(i)); + Operands.push_back(cast<Instruction>(V)->getOperand(I)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, I}); } return; } @@ -6210,7 +6344,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, llvm_unreachable("Unexpected vectorization of the instructions."); } -unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { +unsigned BoUpSLP::canMapToVector(Type *T) const { unsigned N = 1; Type *EltTy = T; @@ -6234,15 +6368,16 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); + uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || - VTSize != DL.getTypeStoreSizeInBits(T)) + VTSize != DL->getTypeStoreSizeInBits(T)) return 0; return N; } bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, - SmallVectorImpl<unsigned> &CurrentOrder) const { + SmallVectorImpl<unsigned> &CurrentOrder, + bool ResizeAllowed) const { const auto *It = find_if(VL, [](Value *V) { return isa<ExtractElementInst, ExtractValueInst>(V); }); @@ -6263,8 +6398,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; if (E0->getOpcode() == Instruction::ExtractValue) { - const DataLayout &DL = E0->getModule()->getDataLayout(); - NElts = canMapToVector(Vec->getType(), DL); + NElts = canMapToVector(Vec->getType()); if (!NElts) return false; // Check if load can be rewritten as load of vector. @@ -6275,46 +6409,55 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); } - if (NElts != VL.size()) - return false; - - // Check that all of the indices extract from the correct offset. - bool ShouldKeepOrder = true; unsigned E = VL.size(); - // Assign to all items the initial value E + 1 so we can check if the extract - // instruction index was used already. - // Also, later we can check that all the indices are used and we have a - // consecutive access in the extract instructions, by checking that no - // element of CurrentOrder still has value E + 1. 
- CurrentOrder.assign(E, E); - unsigned I = 0; - for (; I < E; ++I) { - auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!ResizeAllowed && NElts != E) + return false; + SmallVector<int> Indices(E, PoisonMaskElem); + unsigned MinIdx = NElts, MaxIdx = 0; + for (auto [I, V] : enumerate(VL)) { + auto *Inst = dyn_cast<Instruction>(V); if (!Inst) continue; if (Inst->getOperand(0) != Vec) - break; + return false; if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) if (isa<UndefValue>(EE->getIndexOperand())) continue; std::optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) - break; + return false; const unsigned ExtIdx = *Idx; - if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) - break; - ShouldKeepOrder = false; - CurrentOrder[ExtIdx] = I; - } else { - if (CurrentOrder[I] != E) - break; - CurrentOrder[I] = I; - } + if (ExtIdx >= NElts) + continue; + Indices[I] = ExtIdx; + if (MinIdx > ExtIdx) + MinIdx = ExtIdx; + if (MaxIdx < ExtIdx) + MaxIdx = ExtIdx; } - if (I < E) { - CurrentOrder.clear(); + if (MaxIdx - MinIdx + 1 > E) return false; + if (MaxIdx + 1 <= E) + MinIdx = 0; + + // Check that all of the indices extract from the correct offset. + bool ShouldKeepOrder = true; + // Assign to all items the initial value E + 1 so we can check if the extract + // instruction index was used already. + // Also, later we can check that all the indices are used and we have a + // consecutive access in the extract instructions, by checking that no + // element of CurrentOrder still has value E + 1. + CurrentOrder.assign(E, E); + for (unsigned I = 0; I < E; ++I) { + if (Indices[I] == PoisonMaskElem) + continue; + const unsigned ExtIdx = Indices[I] - MinIdx; + if (CurrentOrder[ExtIdx] != E) { + CurrentOrder.clear(); + return false; + } + ShouldKeepOrder &= ExtIdx == I; + CurrentOrder[ExtIdx] = I; } if (ShouldKeepOrder) CurrentOrder.clear(); @@ -6322,9 +6465,9 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, return ShouldKeepOrder; } -bool BoUpSLP::areAllUsersVectorized(Instruction *I, - ArrayRef<Value *> VectorizedVals) const { - return (I->hasOneUse() && is_contained(VectorizedVals, I)) || +bool BoUpSLP::areAllUsersVectorized( + Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { + return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) || all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0 || isVectorLikeInstWithConstOps(U) || @@ -6351,8 +6494,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); - auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + auto Shape = VFShape::get(CI->getFunctionType(), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); auto LibCost = IntrinsicCost; @@ -6365,16 +6508,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -/// Build shuffle mask for shuffle graph entries and lists of main and alternate -/// operations operands. 
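// The rewritten canReuseExtract above tolerates a source vector wider than
// the bundle: it normalizes the extract indices by the smallest used index
// (unless the used window already fits at offset 0), rejects duplicates, and
// reports whether the extracts are already in identity order. A compact
// standalone sketch of that check; an index of -1 means "unknown/undef", and
// the names are illustrative, not the LLVM code.
#include <vector>

// Returns true if the extracts are already in order; Order holds the
// permutation otherwise, or is cleared if the source cannot be reused.
bool checkExtractOrder(const std::vector<int> &Indices, // one per lane
                       std::vector<unsigned> &Order) {
  const unsigned E = Indices.size();
  int MinIdx = -1, MaxIdx = -1;
  for (int Idx : Indices) {
    if (Idx < 0)
      continue;
    if (MinIdx < 0 || Idx < MinIdx) MinIdx = Idx;
    if (Idx > MaxIdx) MaxIdx = Idx;
  }
  if (MinIdx < 0 || MaxIdx - MinIdx + 1 > static_cast<int>(E)) {
    Order.clear();
    return false; // no usable indices, or the used window is too wide
  }
  if (MaxIdx < static_cast<int>(E)) // the window already fits at offset 0
    MinIdx = 0;
  bool InOrder = true;
  Order.assign(E, E); // the value E marks an unused slot
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] < 0)
      continue;
    unsigned ExtIdx = static_cast<unsigned>(Indices[I] - MinIdx);
    if (Order[ExtIdx] != E) {
      Order.clear();
      return false; // two lanes extract the same element
    }
    InOrder &= ExtIdx == I;
    Order[ExtIdx] = I;
  }
  if (InOrder)
    Order.clear(); // identity order needs no reordering data
  return InOrder;
}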
-static void -buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, - ArrayRef<int> ReusesIndices, - const function_ref<bool(Instruction *)> IsAltOp, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<Value *> *OpScalars = nullptr, - SmallVectorImpl<Value *> *AltScalars = nullptr) { - unsigned Sz = VL.size(); +void BoUpSLP::TreeEntry::buildAltOpShuffleMask( + const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, + SmallVectorImpl<Value *> *OpScalars, + SmallVectorImpl<Value *> *AltScalars) const { + unsigned Sz = Scalars.size(); Mask.assign(Sz, PoisonMaskElem); SmallVector<int> OrderMask; if (!ReorderIndices.empty()) @@ -6383,7 +6521,7 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, unsigned Idx = I; if (!ReorderIndices.empty()) Idx = OrderMask[I]; - auto *OpInst = cast<Instruction>(VL[Idx]); + auto *OpInst = cast<Instruction>(Scalars[Idx]); if (IsAltOp(OpInst)) { Mask[I] = Sz + Idx; if (AltScalars) @@ -6394,9 +6532,9 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices, OpScalars->push_back(OpInst); } } - if (!ReusesIndices.empty()) { - SmallVector<int> NewMask(ReusesIndices.size(), PoisonMaskElem); - transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + if (!ReuseShuffleIndices.empty()) { + SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); + transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) { return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; }); Mask.swap(NewMask); @@ -6429,52 +6567,27 @@ static bool isAlternateInstruction(const Instruction *I, return I->getOpcode() == AltOp->getOpcode(); } -TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL, - unsigned OpIdx) { - assert(!VL.empty()); - const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof)); - const auto *Op0 = I0->getOperand(OpIdx); +TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { + assert(!Ops.empty()); + const auto *Op0 = Ops.front(); - const bool IsConstant = all_of(VL, [&](Value *V) { + const bool IsConstant = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return true; - auto *Op = I->getOperand(OpIdx); - return isConstant(Op) && !isa<UndefValue>(Op); + return isConstant(V) && !isa<UndefValue>(V); }); - const bool IsUniform = all_of(VL, [&](Value *V) { + const bool IsUniform = all_of(Ops, [=](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - return I->getOperand(OpIdx) == Op0; + return V == Op0; }); - const bool IsPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isPowerOf2(); return false; }); - const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) { + const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) { // TODO: We should allow undef elements here - const auto *I = dyn_cast<Instruction>(V); - if (!I) { - assert((isa<UndefValue>(V) || - I0->getOpcode() == Instruction::GetElementPtr) && - "Expected undef or GEP."); - return true; - } - 
const auto *Op = I->getOperand(OpIdx); - if (auto *CI = dyn_cast<ConstantInt>(Op)) + if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isNegatedPowerOf2(); return false; }); @@ -6505,9 +6618,24 @@ protected: bool IsStrict) { int Limit = Mask.size(); int VF = VecTy->getNumElements(); - return (VF == Limit || !IsStrict) && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask); + int Index = -1; + if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit)) + return true; + if (!IsStrict) { + // Consider extract subvector starting from index 0. + if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0) + return true; + // All VF-size submasks are identity (e.g. + // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). + if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) { + ArrayRef<int> Slice = Mask.slice(Idx * VF, VF); + return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) || + ShuffleVectorInst::isIdentityMask(Slice, VF); + })) + return true; + } + return false; } /// Tries to combine 2 different masks into single one. @@ -6577,7 +6705,8 @@ protected: if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) { if (!IdentityOp || !SinglePermute || (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) && - !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) { + !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask, + IdentityMask.size()))) { IdentityOp = SV; // Store current mask in the IdentityMask so later we did not lost // this info if IdentityOp is selected as the best candidate for the @@ -6647,7 +6776,7 @@ protected: } if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || - ShuffleVectorInst::isZeroEltSplatMask(Mask)) { + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) { if (IdentityOp) { V = IdentityOp; assert(Mask.size() == IdentityMask.size() && @@ -6663,7 +6792,7 @@ protected: /*IsStrict=*/true) || (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && Shuffle->isZeroEltSplat() && - ShuffleVectorInst::isZeroEltSplatMask(Mask))); + ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))); } V = Op; return false; @@ -6768,11 +6897,9 @@ protected: CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); } } - const int Limit = CombinedMask1.size() * 2; - if (Op1 == Op2 && Limit == 2 * VF && - all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && - (ShuffleVectorInst::isIdentityMask(CombinedMask1) || - (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && + if (Op1 == Op2 && + (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) || + (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) && isa<ShuffleVectorInst>(Op1) && cast<ShuffleVectorInst>(Op1)->getShuffleMask() == ArrayRef(CombinedMask1)))) @@ -6807,10 +6934,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; const TargetTransformInfo &TTI; InstructionCost Cost = 0; - ArrayRef<Value *> VectorizedVals; + SmallDenseSet<Value *> VectorizedVals; BoUpSLP &R; SmallPtrSetImpl<Value *> &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). 
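// The getOperandInfo change above receives the operand list directly and
// classifies it as constant / uniform / power-of-two. A standalone sketch of
// the same classification over plain integers, where std::nullopt stands for
// a non-constant operand (illustrative names, not the LLVM code):
#include <cstdint>
#include <optional>
#include <vector>

struct OperandInfo {
  bool IsConstant;
  bool IsUniform;
  bool IsPowerOfTwo;
};

OperandInfo classifyOperands(const std::vector<std::optional<int64_t>> &Ops) {
  OperandInfo Info{true, true, true};
  if (Ops.empty()) // the caller above asserts a non-empty operand list
    return Info;
  for (const auto &Op : Ops) {
    Info.IsConstant &= Op.has_value();
    Info.IsUniform &= Op == Ops.front();
    Info.IsPowerOfTwo &=
        Op.has_value() && *Op > 0 && (*Op & (*Op - 1)) == 0;
  }
  return Info;
}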
+ /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; + + static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { + if (Ty->getScalarType()->isPointerTy()) { + Constant *Res = ConstantExpr::getIntToPtr( + ConstantInt::getAllOnesValue( + IntegerType::get(Ty->getContext(), + DL.getTypeStoreSizeInBits(Ty->getScalarType()))), + Ty->getScalarType()); + if (auto *VTy = dyn_cast<VectorType>(Ty)) + Res = ConstantVector::getSplat(VTy->getElementCount(), Res); + return Res; + } + return Constant::getAllOnesValue(Ty); + } InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) @@ -6821,20 +6967,35 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); - if (VL.size() > 2 && S.getOpcode() == Instruction::Load && - !S.isAltShuffle() && + const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType()); + unsigned MinVF = R.getMinVF(2 * Sz); + if (VL.size() > 2 && + ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || + (InVectors.empty() && + any_of(seq<unsigned>(0, VL.size() / MinVF), + [&](unsigned Idx) { + ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF); + InstructionsState S = getSameOpcode(SubVL, *R.TLI); + return S.getOpcode() == Instruction::Load && + !S.isAltShuffle(); + }))) && !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && !isSplat(Gathers)) { - BoUpSLP::ValueSet VectorizedLoads; + SetVector<Value *> VectorizedLoads; + SmallVector<LoadInst *> VectorizedStarts; + SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized; unsigned StartIdx = 0; unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); - for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (; VF >= MinVF; VF /= 2) { for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef<Value *> Slice = VL.slice(Cnt, VF); + if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { + InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); + if (SliceS.getOpcode() != Instruction::Load || + SliceS.isAltShuffle()) + continue; + } if (!VectorizedLoads.count(Slice.front()) && !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { SmallVector<Value *> PointerOps; @@ -6845,12 +7006,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: + case LoadsState::PossibleStridedVectorize: // Mark the vectorized loads so that we don't vectorize them // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; + // TODO: better handling of loads with reorders. + if (LS == LoadsState::Vectorize && CurrentOrder.empty()) + VectorizedStarts.push_back(cast<LoadInst>(Slice.front())); else - ++ScatterVectorizeCnt; + ScatterVectorized.emplace_back(Cnt, VF); VectorizedLoads.insert(Slice.begin(), Slice.end()); // If we vectorized initial block, no need to try to vectorize // it again. @@ -6881,8 +7044,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } // Exclude potentially vectorized loads from list of gathered // scalars. 
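// The gather-cost code above greedily searches for sub-groups of the
// gathered scalars that could still be vectorized as loads: it tries slices
// of width VL.size()/2, keeps halving the width down to the minimum VF, and
// skips slices whose endpoints were already claimed. A standalone sketch of
// that search loop, with the actual legality check abstracted into a
// callback (illustrative names and types, not the LLVM code):
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Returns the (offset, width) pairs of slices accepted by CanVectorize.
std::vector<std::pair<unsigned, unsigned>> findVectorizableSlices(
    unsigned NumScalars, unsigned MinVF,
    const std::function<bool(unsigned Off, unsigned Width)> &CanVectorize) {
  std::vector<std::pair<unsigned, unsigned>> Slices;
  std::vector<bool> Claimed(NumScalars, false);
  unsigned StartIdx = 0;
  for (unsigned VF = NumScalars / 2; VF >= MinVF && VF >= 2; VF /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + VF <= NumScalars; Cnt += VF) {
      if (Claimed[Cnt] || Claimed[Cnt + VF - 1] || !CanVectorize(Cnt, VF))
        continue;
      Slices.emplace_back(Cnt, VF);
      std::fill(Claimed.begin() + Cnt, Claimed.begin() + Cnt + VF, true);
      if (Cnt == StartIdx)
        StartIdx += VF; // the leading block is done, never revisit it
    }
    if (StartIdx >= NumScalars)
      break; // everything is claimed
  }
  return Slices;
}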
- auto *LI = cast<LoadInst>(S.MainOp); - Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType())); + Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); // The cost for vectorized loads. InstructionCost ScalarsCost = 0; for (Value *V : VectorizedLoads) { @@ -6892,17 +7054,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { LI->getAlign(), LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), LI); } - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); + auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF); + for (LoadInst *LI : VectorizedStarts) { + Align Alignment = LI->getAlign(); + GatherCost += + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + } + for (std::pair<unsigned, unsigned> P : ScatterVectorized) { + auto *LI0 = cast<LoadInst>(VL[P.first]); + Align CommonAlignment = LI0->getAlign(); + for (Value *V : VL.slice(P.first + 1, VF - 1)) + CommonAlignment = + std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); + GatherCost += TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind, LI0); + } if (NeedInsertSubvectorAnalysis) { // Add the cost for the subvectors insert. for (int I = VF, E = VL.size(); I < E; I += VF) @@ -6938,77 +7107,137 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the - /// extracted values from \p VL. - InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || - !NumOfParts || VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. + InstructionCost + computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned NumElts = + std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + if (!EE) + return Sz; + auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType()); + return std::max(Sz, VecTy->getNumElements()); + }); + unsigned NumSrcRegs = TTI.getNumberOfParts( + FixedVectorType::get(VL.front()->getType(), NumElts)); + if (NumSrcRegs == 0) + NumSrcRegs = 1; + // FIXME: this must be moved to TTI for better estimation. 
+ unsigned EltsPerVector = PowerOf2Ceil(std::max( + divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + auto CheckPerRegistersShuffle = + [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> { + DenseSet<int> RegIndices; + // Check that if trying to permute same single/2 input vectors. + TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + int FirstRegId = -1; + for (int &I : Mask) { + if (I == PoisonMaskElem) + continue; + int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; + if (FirstRegId < 0) + FirstRegId = RegId; + RegIndices.insert(RegId); + if (RegIndices.size() > 2) + return std::nullopt; + if (RegIndices.size() == 2) + ShuffleKind = TTI::SK_PermuteTwoSrc; + I = (I % NumElts) % EltsPerVector + + (RegId == FirstRegId ? 0 : EltsPerVector); + } + return ShuffleKind; + }; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - AllConsecutive = true; + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - } - - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem) + ArrayRef<int> MaskSlice = + Mask.slice(Part * EltsPerVector, + (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) + ? Mask.size() % EltsPerVector + : EltsPerVector); + SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); + copy(MaskSlice, SubMask.begin()); + std::optional<TTI::ShuffleKind> RegShuffleKind = + CheckPerRegistersShuffle(SubMask); + if (!RegShuffleKind) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), - RegMask); + if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { + Cost += TTI.getShuffleCost( + *RegShuffleKind, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); + } } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. 
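// CheckPerRegistersShuffle above asks whether one register-sized slice of a
// combined two-input mask can be done as a single per-register shuffle: it
// maps every mask element to the physical register it lives in, gives up if
// more than two registers are touched, and renumbers the lanes relative to
// those registers. A standalone sketch of that classification; indices in
// [0, NumElts) select the first input and [NumElts, 2*NumElts) the second
// (illustrative names, not the LLVM code).
#include <optional>
#include <set>
#include <vector>

enum class SliceKind { SingleSource, TwoSource };

std::optional<SliceKind> classifyRegisterSlice(std::vector<int> &MaskSlice,
                                               int NumElts, int NumParts,
                                               int EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1;
  for (int &I : MaskSlice) {
    if (I < 0) // poison lane, ignore
      continue;
    int Reg = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt; // would need more than two physical registers
    // Renumber the lane so the first register occupies [0, EltsPerVector)
    // and the other one [EltsPerVector, 2 * EltsPerVector).
    I = (I % NumElts) % EltsPerVector + (Reg == FirstReg ? 0 : EltsPerVector);
  }
  return Regs.size() == 2 ? SliceKind::TwoSource : SliceKind::SingleSource;
}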
+ static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, + ArrayRef<int> Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef<int> Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. + if ((InVectors.size() == 2 && + InVectors.front().get<const TreeEntry *>() == &E1 && + InVectors.back().get<const TreeEntry *>() == E2) || + (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef<int> SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { - int Limit = 2 * VF; + int Index = -1; return Mask.empty() || (VF == Mask.size() && - all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, VF)) || + (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index == 0); } public: @@ -7021,21 +7250,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteTwoSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { // Empty mask or identity mask are free. 
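// ShuffleCostBuilder::isEmptyOrIdentity above now also treats an
// extract-subvector mask that starts at element 0 as free. Concretely, a
// mask costs nothing when every defined lane selects its own position, which
// covers both the full identity (mask size == source VF) and a leading
// subvector extract (mask size <= source VF). A standalone sketch of that
// predicate, where -1 marks an undefined lane (illustrative, not the LLVM
// code):
#include <vector>

bool isFreeShuffleMask(const std::vector<int> &Mask, unsigned SourceVF) {
  if (Mask.empty())
    return true;
  if (Mask.size() > SourceVF)
    return false; // wider than the source: not an identity or a sub-extract
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
    if (Mask[I] >= 0 && Mask[I] != static_cast<int>(I))
      return false;
  return true;
}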
- if (isEmptyOrIdentity(Mask, Mask.size())) + unsigned VF = + cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return TTI.getShuffleCost( - TTI::SK_PermuteSingleSrc, - FixedVectorType::get( - cast<VectorType>(V1->getType())->getElementType(), Mask.size()), - Mask); + return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, + cast<VectorType>(V1->getType()), Mask); } InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } InstructionCost createPoison(Type *Ty, unsigned VF) const { @@ -7052,139 +7277,226 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const PointerUnion<Value *, const TreeEntry *> &P2, ArrayRef<int> Mask) { ShuffleCostBuilder Builder(TTI); + SmallVector<int> CommonMask(Mask.begin(), Mask.end()); Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); - unsigned CommonVF = 0; - if (!V1) { + unsigned CommonVF = Mask.size(); + if (!V1 && !V2 && !P2.isNull()) { + // Shuffle 2 entry nodes. const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - if (V2) { - unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements(); - if (V2VF != VF && V2VF == E->Scalars.size()) - VF = E->Scalars.size(); - } else if (!P2.isNull()) { - const TreeEntry *E2 = P2.get<const TreeEntry *>(); - if (E->Scalars.size() == E2->Scalars.size()) - CommonVF = VF = E->Scalars.size(); - } else { - // P2 is empty, check that we have same node + reshuffle (if any). - if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { - VF = E->Scalars.size(); - SmallVector<int> CommonMask(Mask.begin(), Mask.end()); - ::addMask(CommonMask, E->getCommonMask()); - V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - return BaseShuffleAnalysis::createShuffle<InstructionCost>( - V1, nullptr, CommonMask, Builder); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E->Scalars.size() == E2->Scalars.size()) { + SmallVector<int> EMask = E->getCommonMask(); + SmallVector<int> E2Mask = E2->getCommonMask(); + if (!EMask.empty() || !E2Mask.empty()) { + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) + Idx = EMask[Idx]; + else if (Idx >= static_cast<int>(CommonVF)) + Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) + + E->Scalars.size(); + } } + CommonVF = E->Scalars.size(); } V1 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - if (!V2 && !P2.isNull()) { - const TreeEntry *E = P2.get<const TreeEntry *>(); + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && P2.isNull()) { + // Shuffle single entry node. 
+ const TreeEntry *E = P1.get<const TreeEntry *>(); unsigned VF = E->getVectorFactor(); - unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (!CommonVF && V1VF == E->Scalars.size()) + CommonVF = VF; + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { + SmallVector<int> EMask = E->getCommonMask(); + assert(!EMask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx != PoisonMaskElem) + Idx = EMask[Idx]; + } CommonVF = E->Scalars.size(); - if (CommonVF) - VF = CommonVF; - V2 = Constant::getNullValue( - FixedVectorType::get(E->Scalars.front()->getType(), VF)); - } - return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask, - Builder); + } + V1 = Constant::getNullValue( + FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + } else if (V1 && P2.isNull()) { + // Shuffle single vector. + CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements(); + assert( + all_of(Mask, + [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && + "All elements in mask must be less than CommonVF."); + } else if (V1 && !V2) { + // Shuffle vector and tree node. + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + const TreeEntry *E2 = P2.get<const TreeEntry *>(); + CommonVF = std::max(VF, E2->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E2->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E2Mask = E2->getCommonMask(); + assert(!E2Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E2Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF)); + } else if (!V1 && V2) { + // Shuffle vector and tree node. 
+ unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements(); + const TreeEntry *E1 = P1.get<const TreeEntry *>(); + CommonVF = std::max(VF, E1->getVectorFactor()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (E1->Scalars.size() == VF && VF != CommonVF) { + SmallVector<int> E1Mask = E1->getCommonMask(); + assert(!E1Mask.empty() && "Expected non-empty common mask."); + for (int &Idx : CommonMask) { + if (Idx == PoisonMaskElem) + continue; + if (Idx >= static_cast<int>(CommonVF)) + Idx = E1Mask[Idx - CommonVF] + VF; + } + CommonVF = VF; + } + V1 = Constant::getNullValue( + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, + FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF)); + } else { + assert(V1 && V2 && "Expected both vectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + CommonVF = + std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements()); + assert(all_of(Mask, + [=](int Idx) { + return Idx < 2 * static_cast<int>(CommonVF); + }) && + "All elements in mask must be less than 2 * CommonVF."); + if (V1->getType() != V2->getType()) { + V1 = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF)); + V2 = getAllOnesValue( + *R.DL, FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonVF)); + } + } + InVectors.front() = Constant::getNullValue(FixedVectorType::get( + cast<FixedVectorType>(V1->getType())->getElementType(), + CommonMask.size())); + if (InVectors.size() == 2) + InVectors.pop_back(); + return BaseShuffleAnalysis::createShuffle<InstructionCost>( + V1, V2, CommonMask, Builder); } public: ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef<Value *> VectorizedVals, BoUpSLP &R, SmallPtrSetImpl<Value *> &CheckedExtracts) - : TTI(TTI), VectorizedVals(VectorizedVals), R(R), - CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask, - TTI::ShuffleKind ShuffleKind) { + : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), + R(R), CheckedExtracts(CheckedExtracts) {} + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef<Value *> VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap<Value *, int> ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. - if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. 
- const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast<ExtractElementInst>(V); - VecBase = EE->getVectorOperand(); - std::optional<unsigned> EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min<int>(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { - return isa<GetElementPtrInst>(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + // Check if it can be considered reused if same extractelements were + // vectorized already. + bool PrevNodeFound = any_of( + ArrayRef(R.VectorizableTree).take_front(E->Idx), + [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isAltShuffle() && + TE->getOpcode() == Instruction::ExtractElement) || + TE->State == TreeEntry::NeedToGather) && + all_of(enumerate(TE->Scalars), [&](auto &&Data) { + return VL.size() > Data.index() && + (Mask[Data.index()] == PoisonMaskElem || + isa<UndefValue>(VL[Data.index()]) || + Data.value() == VL[Data.index()]); + }); + }); + SmallPtrSet<Value *, 4> UniqueBases; + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. + if (isa<UndefValue>(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ auto *EE = cast<ExtractElementInst>(V); + VecBase = EE->getVectorOperand(); + UniqueBases.insert(VecBase); + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + std::optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. + if (EE->hasOneUse() || !PrevNodeFound) { + Instruction *Ext = EE->user_back(); + if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) { + return isa<GetElementPtrInst>(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } // Check that gather of extractelements can be represented as just a @@ -7192,31 +7504,152 @@ public: // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + // Done for reused if same extractelements were vectorized already. + if (!PrevNodeFound) + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; + if (NumParts != 1 && UniqueBases.size() != 1) { + UseVecBaseAsInput = true; + VecBase = Constant::getNullValue( + FixedVectorType::get(VL.front()->getType(), CommonMask.size())); + } return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + /// Checks if the specified entry \p E needs to be delayed because of its + /// dependency nodes. + std::optional<InstructionCost> + needToDelay(const TreeEntry *, + ArrayRef<SmallVector<const TreeEntry *>>) const { + // No need to delay the cost estimation during analysis. 
+ return std::nullopt; } - void add(const TreeEntry *E1, ArrayRef<int> Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + if (&E1 == &E2) { + assert(all_of(Mask, + [&](int Idx) { + return Idx < static_cast<int>(E1.getVectorFactor()); + }) && + "Expected single vector shuffle mask."); + add(E1, Mask); + return; + } + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); + } + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + if (NumParts == 0 || NumParts >= Mask.size()) + NumParts = 1; + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); + } + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef<int> Mask) { + // May come only for shuffling of 2 vectors with extractelements, already + // handled in adjustExtracts. + assert(InVectors.size() == 1 && + all_of(enumerate(CommonMask), + [&](auto P) { + if (P.value() == PoisonMaskElem) + return Mask[P.index()] == PoisonMaskElem; + auto *EI = + cast<ExtractElementInst>(InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]); + return EI->getVectorOperand() == V1 || + EI->getVectorOperand() == V2; + }) && + "Expected extractelement vectors."); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { + if (InVectors.empty()) { + assert(CommonMask.empty() && !ForExtracts && + "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + if (ForExtracts) { + // No need to add vectors here, already handled them in adjustExtracts. 
+ assert(InVectors.size() == 1 && + InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() && + all_of(enumerate(CommonMask), + [&](auto P) { + Value *Scalar = InVectors.front() + .get<const TreeEntry *>() + ->Scalars[P.index()]; + if (P.value() == PoisonMaskElem) + return P.value() == Mask[P.index()] || + isa<UndefValue>(Scalar); + if (isa<Constant>(V1)) + return true; + auto *EI = cast<ExtractElementInst>(Scalar); + return EI->getVectorOperand() == V1; + }) && + "Expected only tree entry for extractelement vectors."); + return; + } + assert(!InVectors.empty() && !CommonMask.empty() && + "Expected only tree entries from extracts/reused buildvectors."); + unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); + if (InVectors.size() == 2) { + Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + VF = std::max<unsigned>(VF, CommonMask.size()); + } else if (const auto *InTE = + InVectors.front().dyn_cast<const TreeEntry *>()) { + VF = std::max(VF, InTE->getVectorFactor()); + } else { + VF = std::max( + VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType()) + ->getNumElements()); + } + InVectors.push_back(V1); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); if (!Root) { - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); // FIXME: Need to find a way to avoid use of getNullValue here. SmallVector<Constant *> Vals; - for (Value *V : VL) { + unsigned VF = VL.size(); + if (MaskVF != 0) + VF = std::min(VF, MaskVF); + for (Value *V : VL.take_front(VF)) { if (isa<UndefValue>(V)) { Vals.push_back(cast<Constant>(V)); continue; @@ -7226,9 +7659,11 @@ public: return ConstantVector::get(Vals); } return ConstantVector::getSplat( - ElementCount::getFixed(VL.size()), - Constant::getNullValue(VL.front()->getType())); + ElementCount::getFixed( + cast<FixedVectorType>(Root->getType())->getNumElements()), + getAllOnesValue(*R.DL, VL.front()->getType())); } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
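The add(V1, Mask) overload above folds another input vector into the running CommonMask by offsetting the new vector's lane indices with the accumulated vector factor (CommonMask[Idx] = Mask[Idx] + VF). That matches the usual two-source shuffle convention: mask entries below VF address lanes of the first source, entries in [VF, 2*VF) address the second source, and a poison entry (-1) leaves the lane unspecified. Below is a minimal standalone sketch of that convention in plain C++; kPoison and applyShuffle are illustrative names, not LLVM APIs.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Sentinel mirroring PoisonMaskElem: the lane's value is unspecified.
    constexpr int kPoison = -1;

    // Apply a two-source shuffle mask: indices [0, VF) pick from A,
    // indices [VF, 2*VF) pick from B, kPoison yields a "don't care" lane.
    std::vector<int> applyShuffle(const std::vector<int> &A,
                                  const std::vector<int> &B,
                                  const std::vector<int> &Mask) {
      assert(A.size() == B.size() && "sources assumed to have equal VF");
      const int VF = static_cast<int>(A.size());
      std::vector<int> Out;
      Out.reserve(Mask.size());
      for (int Idx : Mask) {
        if (Idx == kPoison)
          Out.push_back(0);           // arbitrary: lane is undefined
        else if (Idx < VF)
          Out.push_back(A[Idx]);      // lane comes from the first source
        else
          Out.push_back(B[Idx - VF]); // lane comes from the second source
      }
      return Out;
    }

    int main() {
      std::vector<int> A = {10, 11, 12, 13};
      std::vector<int> B = {20, 21, 22, 23};
      // Take lanes 0 and 2 of A, then lanes 1 and 3 of B (offset by VF == 4).
      std::vector<int> Mask = {0, 2, 4 + 1, 4 + 3};
      for (int V : applyShuffle(A, B, Mask))
        std::printf("%d ", V);        // prints: 10 12 21 23
      std::printf("\n");
      return 0;
    }

With that convention, the Mask[Idx] + VF adjustment simply re-targets lanes of the newly appended vector so they select from the second source of the combined shuffle.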
InstructionCost finalize(ArrayRef<int> ExtMask, unsigned VF = 0, @@ -7236,31 +7671,24 @@ public: IsFinalized = true; if (Action) { const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); - if (InVectors.size() == 2) { + if (InVectors.size() == 2) Cost += createShuffle(Vec, InVectors.back(), CommonMask); - InVectors.pop_back(); - } else { + else Cost += createShuffle(Vec, nullptr, CommonMask); - } for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; assert(VF > 0 && "Expected vector length for the final value before action."); - Value *V = Vec.dyn_cast<Value *>(); - if (!Vec.isNull() && !V) - V = Constant::getNullValue(FixedVectorType::get( - Vec.get<const TreeEntry *>()->Scalars.front()->getType(), - CommonMask.size())); + Value *V = Vec.get<Value *>(); Action(V, CommonMask); + InVectors.front() = V; } ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); - if (CommonMask.empty()) - return Cost; - int Limit = CommonMask.size() * 2; - if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(CommonMask)) + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; + } return Cost + createShuffle(InVectors.front(), InVectors.size() == 2 ? InVectors.back() : nullptr, @@ -7273,28 +7701,63 @@ public: } }; +const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, + unsigned Idx) const { + Value *Op = E->getOperand(Idx).front(); + if (const TreeEntry *TE = getTreeEntry(Op)) { + if (find_if(E->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + auto MIt = MultiNodeScalars.find(Op); + if (MIt != MultiNodeScalars.end()) { + for (const TreeEntry *TE : MIt->second) { + if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end()) + return TE; + } + } + } + const auto *It = + find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && + find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.EdgeIdx == Idx && EI.UserTE == E; + }) != TE->UserTreeIndices.end(); + }); + assert(It != VectorizableTree.end() && "Expected vectorizable entry."); + return It->get(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, SmallPtrSetImpl<Value *> &CheckedExtracts) { ArrayRef<Value *> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (auto *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast<CmpInst>(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); + if (E->State != TreeEntry::NeedToGather) { + if (auto *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast<CmpInst>(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + } + if (!FixedVectorType::isValidElementType(ScalarTy)) + return InstructionCost::getInvalid(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so // that the costs 
will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7302,121 +7765,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return 0; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); - ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, - CheckedExtracts); - unsigned VF = E->getVectorFactor(); - SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector<int> ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - // Restore the mask for previous partially matched values. 
- for (auto [I, V] : enumerate(E->Scalars)) { - if (isa<PoisonValue>(V)) { - Mask[I] = PoisonMaskElem; - continue; - } - if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); - } - Estimator.add(Entries.front(), Mask); - return Estimator.finalize(E->ReuseShuffleIndices); - } - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != PoisonMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n";); - if (Entries.size() == 1) - Estimator.add(Entries.front(), Mask); - else - Estimator.add(Entries.front(), Entries.back(), Mask); - if (all_of(GatheredScalars, PoisonValue ::classof)) - return Estimator.finalize(E->ReuseShuffleIndices); - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl<int> &Mask) { - Vec = Estimator.gather(GatheredScalars, - Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - }); - } - if (!all_of(GatheredScalars, PoisonValue::classof)) { - auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); - bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), - GatheredScalars.size()))); - SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem); - std::iota(ReuseMask.begin(), ReuseMask.end(), 0); - Estimator.add(BV, ReuseMask); - } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); - return Estimator.finalize(E->ReuseShuffleIndices); + return processBuildVector<ShuffleCostEstimator, InstructionCost>( + E, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector<int> Mask; - if (!E->ReorderIndices.empty()) { + if (!E->ReorderIndices.empty() && + E->State != TreeEntry::PossibleStridedVectorize) { SmallVector<int> NewMask; if (E->getOpcode() == Instruction::Store) { // For stores the order is actually a mask. @@ -7429,11 +7784,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } if (NeedToShuffleReuses) ::addMask(Mask, E->ReuseShuffleIndices); - if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) CommonCost = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -7443,7 +7799,34 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); - const unsigned Sz = VL.size(); + SetVector<Value *> UniqueValues(VL.begin(), VL.end()); + const unsigned Sz = UniqueValues.size(); + SmallBitVector UsedScalars(Sz, false); + for (unsigned I = 0; I < Sz; ++I) { + if (getTreeEntry(UniqueValues[I]) == E) + continue; + UsedScalars.set(I); + } + auto GetCastContextHint = [&](Value *V) { + if (const TreeEntry *OpTE = getTreeEntry(V)) { + if (OpTE->State == TreeEntry::ScatterVectorize) + return TTI::CastContextHint::GatherScatter; + if (OpTE->State == TreeEntry::Vectorize && + OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) { + if (OpTE->ReorderIndices.empty()) + return TTI::CastContextHint::Normal; + SmallVector<int> Mask; + inversePermutation(OpTE->ReorderIndices, Mask); + if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return TTI::CastContextHint::Reversed; + } + } else { + InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); + if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + return TTI::CastContextHint::GatherScatter; + } + return TTI::CastContextHint::None; + }; auto GetCostDiff = [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, function_ref<InstructionCost(InstructionCost)> VectorCost) { @@ -7453,13 +7836,49 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // For some of the instructions no need to calculate cost for each // particular instruction, we can use the cost of the single // instruction x total number of scalar instructions. - ScalarCost = Sz * ScalarEltCost(0); + ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); } else { - for (unsigned I = 0; I < Sz; ++I) + for (unsigned I = 0; I < Sz; ++I) { + if (UsedScalars.test(I)) + continue; ScalarCost += ScalarEltCost(I); + } } InstructionCost VecCost = VectorCost(CommonCost); + // Check if the current node must be resized, if the parent node is not + // resized. + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { + const EdgeInfo &EI = E->UserTreeIndices.front(); + if ((EI.UserTE->getOpcode() != Instruction::Select || + EI.EdgeIdx != 0) && + It != MinBWs.end()) { + auto UserBWIt = MinBWs.find(EI.UserTE); + Type *UserScalarTy = + EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); + if (UserBWIt != MinBWs.end()) + UserScalarTy = IntegerType::get(ScalarTy->getContext(), + UserBWIt->second.first); + if (ScalarTy != UserScalarTy) { + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy); + unsigned VecOpcode; + auto *SrcVecTy = + FixedVectorType::get(UserScalarTy, E->getVectorFactor()); + if (BWSz > SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = GetCastContextHint(VL0); + VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, + CostKind); + ScalarCost += + Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy, + CCH, CostKind); + } + } + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost, "Calculated costs for Tree")); return VecCost - ScalarCost; @@ -7550,7 +7969,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // Count reused scalars. 
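The resize handling above decides which cast to cost when a node's minimum bit width (MinBWs) differs from its user's: a wider source is truncated, a narrower one is sign- or zero-extended according to the recorded signedness, and equal widths degenerate into a bitcast whose cost is not counted (the cast case that follows applies the same rule). A small sketch of that selection, using hypothetical names (pickCast, CastOp) rather than LLVM's Instruction opcodes:

    #include <cstdio>

    enum class CastOp { BitCast, Trunc, SExt, ZExt };

    // Pick the cast needed to bring a value of SrcBits into DstBits, given
    // whether the demoted value must keep its sign. Mirrors, in spirit, the
    // BWSz/SrcBWSz comparison in the hunks above; names are illustrative.
    CastOp pickCast(unsigned DstBits, unsigned SrcBits, bool IsSigned) {
      if (DstBits == SrcBits)
        return CastOp::BitCast;                       // same width: a no-op
      if (DstBits < SrcBits)
        return CastOp::Trunc;                         // narrowing
      return IsSigned ? CastOp::SExt : CastOp::ZExt;  // widening
    }

    const char *name(CastOp Op) {
      switch (Op) {
      case CastOp::BitCast: return "bitcast";
      case CastOp::Trunc:   return "trunc";
      case CastOp::SExt:    return "sext";
      case CastOp::ZExt:    return "zext";
      }
      return "?";
    }

    int main() {
      std::printf("%s\n", name(pickCast(8, 32, /*IsSigned=*/false)));  // trunc
      std::printf("%s\n", name(pickCast(32, 8, /*IsSigned=*/true)));   // sext
      std::printf("%s\n", name(pickCast(16, 16, /*IsSigned=*/false))); // bitcast
      return 0;
    }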
InstructionCost ScalarCost = 0; SmallPtrSet<const TreeEntry *, 4> CountedOps; - for (Value *V : VL) { + for (Value *V : UniqueValues) { auto *PHI = dyn_cast<PHINode>(V); if (!PHI) continue; @@ -7571,8 +7990,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } case Instruction::ExtractValue: case Instruction::ExtractElement: { - auto GetScalarCost = [=](unsigned Idx) { - auto *I = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *I = cast<Instruction>(UniqueValues[Idx]); VectorType *SrcVecTy; if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast<ExtractElementInst>(I); @@ -7680,8 +8099,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // need to shift the vector. // Do not calculate the cost if the actual size is the register size and // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz); if (!IsIdentity) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, InsertVecTy, Mask); @@ -7697,8 +8115,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { if (InsertVecSz != VecSz) { - auto *ActualVecTy = - FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz); Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, std::nullopt, CostKind, OffsetBeg - Offset, InsertVecTy); @@ -7729,22 +8146,52 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); - return TTI->getCastInstrCost(E->getOpcode(), ScalarTy, - VI->getOperand(0)->getType(), + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + unsigned Opcode = ShuffleOrOp; + unsigned VecOpcode = Opcode; + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. + unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) { + SrcBWSz = SrcIt->second.first; + SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz); + SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size()); + } + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return TTI::TCC_Free; + auto *VI = VL0->getOpcode() == Opcode + ? 
cast<Instruction>(UniqueValues[Idx]) + : nullptr; + return TTI->getCastInstrCost(Opcode, VL0->getType(), + VL0->getOperand(0)->getType(), TTI::getCastContextHint(VI), CostKind, VI); }; auto GetVectorCost = [=](InstructionCost CommonCost) { - Type *SrcTy = VL0->getOperand(0)->getType(); - auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); - InstructionCost VecCost = CommonCost; - // Check if the values are candidates to demote. - if (!MinBWs.count(VL0) || VecTy != SrcVecTy) - VecCost += - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); - return VecCost; + // Do not count cost here if minimum bitwidth is in effect and it is just + // a bitcast (here it is just a noop). + if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) + return CommonCost; + auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; + TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0)); + return CommonCost + + TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind, + VecOpcode == Opcode ? VI : nullptr); }; return GetCostDiff(GetScalarCost, GetVectorCost); } @@ -7761,7 +8208,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; auto GetScalarCost = [&](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto *VI = cast<Instruction>(UniqueValues[Idx]); CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; @@ -7821,8 +8268,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); TTI::OperandValueInfo Op2Info = @@ -7833,8 +8280,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; auto GetVectorCost = [=](InstructionCost CommonCost) { unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1; - TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); - TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); + TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0)); + TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx)); return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, Op2Info) + CommonCost; @@ -7845,23 +8292,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return CommonCost + GetGEPCostDiff(VL, VL0); } case Instruction::Load: { - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<LoadInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<LoadInst>(UniqueValues[Idx]); return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), VI); }; auto *LI0 = cast<LoadInst>(VL0); - auto GetVectorCost = [=](InstructionCost CommonCost) { + auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unknown EntryState"); Align CommonAlignment = LI0->getAlign(); - for (Value *V : VL) + for (Value *V : UniqueValues) CommonAlignment = std::min(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( @@ -7874,7 +8323,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); // If this node generates masked gather load then it is not a terminal node. // Hence address operand cost is estimated separately. - if (E->State == TreeEntry::ScatterVectorize) + if (E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) return Cost; // Estimate cost of GEPs since this tree node is a terminator. @@ -7887,7 +8337,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, bool IsReorder = !E->ReorderIndices.empty(); auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast<StoreInst>(VL[Idx]); - TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, OpInfo, VI); @@ -7896,7 +8346,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); auto GetVectorCost = [=](InstructionCost CommonCost) { // We know that we can merge the stores. Calculate the cost. 
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); + TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0)); return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind, OpInfo) + @@ -7912,8 +8362,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); } case Instruction::Call: { - auto GetScalarCost = [=](unsigned Idx) { - auto *CI = cast<CallInst>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *CI = cast<CallInst>(UniqueValues[Idx]); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID != Intrinsic::not_intrinsic) { IntrinsicCostAttributes CostAttrs(ID, *CI, 1); @@ -7954,8 +8404,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return false; }; - auto GetScalarCost = [=](unsigned Idx) { - auto *VI = cast<Instruction>(VL[Idx]); + auto GetScalarCost = [&](unsigned Idx) { + auto *VI = cast<Instruction>(UniqueValues[Idx]); assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); (void)E; return TTI->getInstructionCost(VI, CostKind); @@ -7995,21 +8445,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - if (E->ReuseShuffleIndices.empty()) { - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); - } else { - SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); - } + SmallVector<int> Mask; + E->buildAltOpShuffleMask( + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); return VecCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); @@ -8065,7 +8509,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { // Gathering cost would be too much for tiny trees. if (VectorizableTree[0]->State == TreeEntry::NeedToGather || (VectorizableTree[1]->State == TreeEntry::NeedToGather && - VectorizableTree[0]->State != TreeEntry::ScatterVectorize)) + VectorizableTree[0]->State != TreeEntry::ScatterVectorize && + VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize)) return false; return true; @@ -8144,6 +8589,23 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { allConstant(VectorizableTree[1]->Scalars)))) return true; + // If the graph includes only PHI nodes and gathers, it is defnitely not + // profitable for the vectorization, we can skip it, if the cost threshold is + // default. The cost of vectorized PHI nodes is almost always 0 + the cost of + // gathers/buildvectors. 
+ constexpr int Limit = 4; + if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && + !VectorizableTree.empty() && + all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() != Instruction::ExtractElement && + count_if(TE->Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }) <= + Limit) || + TE->getOpcode() == Instruction::PHI; + })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -8435,16 +8897,6 @@ static T *performExtractsShuffleAction( } InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { - // Build a map for gathered scalars to the nodes where they are used. - ValueToGatherNodes.clear(); - for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { - if (EntryPtr->State != TreeEntry::NeedToGather) - continue; - for (Value *V : EntryPtr->Scalars) - if (!isConstant(V)) - ValueToGatherNodes.try_emplace(V).first->getSecond().insert( - EntryPtr.get()); - } InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -8460,8 +8912,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { E->isSame(TE.Scalars)) { // Some gather nodes might be absolutely the same as some vectorizable // nodes after reordering, need to handle it. - LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " - << *TE.Scalars[0] << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); continue; } @@ -8469,9 +8921,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " + << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } @@ -8480,6 +8931,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; SmallVector<APInt> DemandedElts; + SmallDenseSet<Value *, 4> UsedInserts; + DenseSet<Value *> VectorCasts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull<InsertElementInst>(EU.User) && @@ -8500,6 +8953,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // to detect it as a final shuffled/identity match. 
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + if (!UsedInserts.insert(VU).second) + continue; std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); @@ -8546,6 +9001,28 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { FirstUsers.emplace_back(VU, ScalarTE); DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; + auto It = MinBWs.find(ScalarTE); + if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) { + unsigned BWSz = It->second.second; + unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType()); + unsigned VecOpcode; + if (BWSz < SrcBWSz) + VecOpcode = Instruction::Trunc; + else + VecOpcode = + It->second.second ? Instruction::SExt : Instruction::ZExt; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost C = TTI->getCastInstrCost( + VecOpcode, FTy, + FixedVectorType::get( + IntegerType::get(FTy->getContext(), It->second.first), + FTy->getNumElements()), + TTI::CastContextHint::None, CostKind); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for extending externally used vector with " + "non-equal minimum bitwidth.\n"); + Cost += C; + } } else { if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) It->first = VU; @@ -8567,11 +9044,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; + auto It = MinBWs.find(getTreeEntry(EU.Scalar)); + if (It != MinBWs.end()) { + auto *MinTy = IntegerType::get(F->getContext(), It->second.first); + unsigned Extend = + It->second.second ? Instruction::SExt : Instruction::ZExt; VecTy = FixedVectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); @@ -8580,6 +9057,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { CostKind, EU.Lane); } } + // Add reduced value cost, if resized. + if (!VectorizedVals.empty()) { + auto BWIt = MinBWs.find(VectorizableTree.front().get()); + if (BWIt != MinBWs.end()) { + Type *DstTy = VectorizableTree.front()->Scalars.front()->getType(); + unsigned OriginalSz = DL->getTypeSizeInBits(DstTy); + unsigned Opcode = Instruction::Trunc; + if (OriginalSz < BWIt->second.first) + Opcode = BWIt->second.second ? 
Instruction::SExt : Instruction::ZExt; + Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first); + Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, + TTI::CastContextHint::None, + TTI::TCK_RecipThroughput); + } + } InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; @@ -8590,9 +9082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { unsigned VecVF = TE->getVectorFactor(); if (VF != VecVF && (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || - (all_of(Mask, - [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) && - !ShuffleVectorInst::isIdentityMask(Mask)))) { + !ShuffleVectorInst::isIdentityMask(Mask, VF))) { SmallVector<int> OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); @@ -8611,19 +9101,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // Calculate the cost of the reshuffled vectors, if any. for (int I = 0, E = FirstUsers.size(); I < E; ++I) { Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); - unsigned VF = ShuffleMasks[I].begin()->second.size(); - auto *FTy = FixedVectorType::get( - cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF); auto Vector = ShuffleMasks[I].takeVector(); - auto &&EstimateShufflesCost = [this, FTy, - &Cost](ArrayRef<int> Mask, - ArrayRef<const TreeEntry *> TEs) { + unsigned VF = 0; + auto EstimateShufflesCost = [&](ArrayRef<int> Mask, + ArrayRef<const TreeEntry *> TEs) { assert((TEs.size() == 1 || TEs.size() == 2) && "Expected exactly 1 or 2 tree entries."); if (TEs.size() == 1) { - int Limit = 2 * Mask.size(); - if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || - !ShuffleVectorInst::isIdentityMask(Mask)) { + if (VF == 0) + VF = TEs.front()->getVectorFactor(); + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); + if (!ShuffleVectorInst::isIdentityMask(Mask, VF) && + !all_of(enumerate(Mask), [=](const auto &Data) { + return Data.value() == PoisonMaskElem || + (Data.index() < VF && + static_cast<int>(Data.index()) == Data.value()); + })) { InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8634,6 +9128,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { Cost += C; } } else { + if (VF == 0) { + if (TEs.front() && + TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) + VF = TEs.front()->getVectorFactor(); + else + VF = Mask.size(); + } + auto *FTy = + FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -8643,6 +9146,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; } + VF = Mask.size(); return TEs.back(); }; (void)performExtractsShuffleAction<const TreeEntry>( @@ -8671,54 +9175,198 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { return Cost; } -std::optional<TargetTransformInfo::ShuffleKind> -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<const TreeEntry *> &Entries) { - Entries.clear(); - // No need to check for the topmost gather node. 
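Several of the reshuffle checks above (for externally used vectors and for resized tree entries) now treat a mask as free when it is an identity up to poison: every element either is PoisonMaskElem or keeps its own lane below the vector factor. A minimal sketch of that predicate, with an illustrative helper name:

    #include <cstdio>
    #include <vector>

    constexpr int kPoison = -1;

    // A mask of width VF needs no shuffle if every element either is poison
    // or simply keeps its own lane, i.e. Mask[I] == I.
    bool isIdentityOrPoison(const std::vector<int> &Mask, unsigned VF) {
      for (unsigned I = 0; I < Mask.size(); ++I) {
        int Elt = Mask[I];
        if (Elt == kPoison)
          continue;                    // undefined lane, costs nothing
        if (I >= VF || Elt != static_cast<int>(I))
          return false;                // lane actually moves: shuffle needed
      }
      return true;
    }

    int main() {
      std::printf("%d\n", isIdentityOrPoison({0, kPoison, 2, 3}, 4)); // 1
      std::printf("%d\n", isIdentityOrPoison({1, 0, 2, 3}, 4));       // 0
      return 0;
    }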
- if (TE == VectorizableTree.front().get()) +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +std::optional<TTI::ShuffleKind> +BoUpSLP::tryToGatherSingleRegisterExtractElements( + MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MapVector<Value *, SmallVector<int>> VectorOpToIdx; + SmallVector<int> UndefVectorExtracts; + for (int I = 0, E = VL.size(); I < E; ++I) { + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI) { + if (isa<UndefValue>(VL[I])) + UndefVectorExtracts.push_back(I); + continue; + } + auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); + if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) + continue; + std::optional<unsigned> Idx = getExtractIndex(EI); + // Undefined index. + if (!Idx) { + UndefVectorExtracts.push_back(I); + continue; + } + SmallBitVector ExtractMask(VecTy->getNumElements(), true); + ExtractMask.reset(*Idx); + if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { + UndefVectorExtracts.push_back(I); + continue; + } + VectorOpToIdx[EI->getVectorOperand()].push_back(I); + } + // Sort the vector operands by the maximum number of uses in extractelements. + MapVector<unsigned, SmallVector<Value *>> VFToVector; + for (const auto &Data : VectorOpToIdx) + VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()] + .push_back(Data.first); + for (auto &Data : VFToVector) { + stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { + return VectorOpToIdx.find(V1)->second.size() > + VectorOpToIdx.find(V2)->second.size(); + }); + } + // Find the best pair of the vectors with the same number of elements or a + // single vector. + const int UndefSz = UndefVectorExtracts.size(); + unsigned SingleMax = 0; + Value *SingleVec = nullptr; + unsigned PairMax = 0; + std::pair<Value *, Value *> PairVec(nullptr, nullptr); + for (auto &Data : VFToVector) { + Value *V1 = Data.second.front(); + if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { + SingleMax = VectorOpToIdx[V1].size() + UndefSz; + SingleVec = V1; + } + Value *V2 = nullptr; + if (Data.second.size() > 1) + V2 = *std::next(Data.second.begin()); + if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + + UndefSz) { + PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; + PairVec = std::make_pair(V1, V2); + } + } + if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) + return std::nullopt; + // Check if better to perform a shuffle of 2 vectors or just of a single + // vector. + SmallVector<Value *> SavedVL(VL.begin(), VL.end()); + SmallVector<Value *> GatheredExtracts( + VL.size(), PoisonValue::get(VL.front()->getType())); + if (SingleMax >= PairMax && SingleMax) { + for (int Idx : VectorOpToIdx[SingleVec]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } else { + for (Value *V : {PairVec.first, PairVec.second}) + for (int Idx : VectorOpToIdx[V]) + std::swap(GatheredExtracts[Idx], VL[Idx]); + } + // Add extracts from undefs too. 
+ for (int Idx : UndefVectorExtracts) + std::swap(GatheredExtracts[Idx], VL[Idx]); + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + std::optional<TTI::ShuffleKind> Res = + isFixedVectorShuffle(GatheredExtracts, Mask); + if (!Res) { + // TODO: try to check other subsets if possible. + // Restore the original VL if attempt was not successful. + copy(SavedVL, VL.begin()); return std::nullopt; + } + // Restore unused scalars from mask, if some of the extractelements were not + // selected for shuffle. + for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { + if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) && + isa<UndefValue>(GatheredExtracts[I])) { + std::swap(VL[I], GatheredExtracts[I]); + continue; + } + auto *EI = dyn_cast<ExtractElementInst>(VL[I]); + if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || + !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || + is_contained(UndefVectorExtracts, I)) + continue; + } + return Res; +} + +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +SmallVector<std::optional<TTI::ShuffleKind>> +BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, + SmallVectorImpl<int> &Mask, + unsigned NumParts) const { + assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); + SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MutableArrayRef<Value *> SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + SmallVector<int> SubMask; + std::optional<TTI::ShuffleKind> Res = + tryToGatherSingleRegisterExtractElements(SubVL, SubMask); + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + } + if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) { + return Res.has_value(); + })) + ShufflesRes.clear(); + return ShufflesRes; +} + +std::optional<TargetTransformInfo::ShuffleKind> +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, + SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) { + Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Instruction &UserInst = - getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE); - BasicBlock *ParentBB = nullptr; + const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); + const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); + const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming // blocks. 
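tryToGatherSingleRegisterExtractElements above scans the gathered scalars for extractelement instructions with constant lanes, groups them by source vector, and keeps the one or two vectors covering the most positions so the gather can be modelled as a shuffle; the matched scalars are then replaced by poison for later analysis. The sketch below shows only the grouping-and-mask core for a single best source, using a plain Extract struct as a stand-in for an extractelement with a constant index (all names are hypothetical):

    #include <cstdio>
    #include <map>
    #include <optional>
    #include <vector>

    constexpr int kPoison = -1;

    // Stand-in for "extractelement <src vector>, <constant lane>"; a scalar
    // that is not such an extract is modelled as std::nullopt.
    struct Extract {
      int SrcVec; // identifier of the source vector
      int Lane;   // constant lane being read
    };

    // Find the single source vector that supplies the most scalars and build
    // a shuffle mask selecting those lanes; other positions stay poison and
    // must still be gathered element by element.
    std::vector<int>
    maskForBestSource(const std::vector<std::optional<Extract>> &Scalars) {
      std::map<int, int> CoveragePerVec; // SrcVec -> #lanes it covers
      for (const auto &E : Scalars)
        if (E)
          ++CoveragePerVec[E->SrcVec];

      int BestVec = -1, BestCoverage = 0;
      for (const auto &[Vec, Count] : CoveragePerVec)
        if (Count > BestCoverage) {
          BestVec = Vec;
          BestCoverage = Count;
        }

      std::vector<int> Mask(Scalars.size(), kPoison);
      for (size_t I = 0; I < Scalars.size(); ++I)
        if (Scalars[I] && Scalars[I]->SrcVec == BestVec)
          Mask[I] = Scalars[I]->Lane;  // representable as a shuffle lane
      return Mask;
    }

    int main() {
      // Three of four scalars read lanes 3, 1, 0 of vector #7; one does not.
      std::vector<std::optional<Extract>> Scalars = {
          Extract{7, 3}, Extract{7, 1}, std::nullopt, Extract{7, 0}};
      for (int M : maskForBestSource(Scalars))
        std::printf("%d ", M);         // prints: 3 1 -1 0
      std::printf("\n");
      return 0;
    }

The real routine additionally considers a pair of source vectors with the same element count and accounts for undef extracts, but the bookkeeping follows the same idea.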
- if (auto *PHI = - dyn_cast<PHINode>(TE->UserTreeIndices.front().UserTE->getMainOp())) { - ParentBB = PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx); + if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) { + TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); + TEInsertPt = TEInsertBlock->getTerminator(); } else { - ParentBB = UserInst.getParent(); + TEInsertBlock = TEInsertPt->getParent(); } - auto *NodeUI = DT->getNode(ParentBB); + auto *NodeUI = DT->getNode(TEInsertBlock); assert(NodeUI && "Should only process reachable instructions"); SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); - auto CheckOrdering = [&](Instruction *LastEI) { - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - // Gather nodes usually are not scheduled and inserted before their first - // user node. So, instead of checking dependency between the gather nodes - // themselves, we check the dependency between their user nodes. - // If one user node comes before the second one, we cannot use the second - // gather node as the source vector for the first gather node, because in - // the list of instructions it will be emitted later. - auto *EntryParent = LastEI->getParent(); - auto *NodeEUI = DT->getNode(EntryParent); + auto CheckOrdering = [&](const Instruction *InsertPt) { + // Argument InsertPt is an instruction where vector code for some other + // tree entry (one that shares one or more scalars with TE) is going to be + // generated. This lambda returns true if insertion point of vector code + // for the TE dominates that point (otherwise dependency is the other way + // around). The other node is not limited to be of a gather kind. Gather + // nodes are not scheduled and their vector code is inserted before their + // first user. If user is PHI, that is supposed to be at the end of a + // predecessor block. Otherwise it is the last instruction among scalars of + // the user node. So, instead of checking dependency between instructions + // themselves, we check dependency between their insertion points for vector + // code (since each scalar instruction ends up as a lane of a vector + // instruction). + const BasicBlock *InsertBlock = InsertPt->getParent(); + auto *NodeEUI = DT->getNode(InsertBlock); if (!NodeEUI) return false; assert((NodeUI == NodeEUI) == (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && "Different nodes should have different DFS numbers"); // Check the order of the gather nodes users. - if (UserInst.getParent() != EntryParent && + if (TEInsertPt->getParent() != InsertBlock && (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) return false; - if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI)) + if (TEInsertPt->getParent() == InsertBlock && + TEInsertPt->comesBefore(InsertPt)) return false; return true; }; @@ -8743,43 +9391,42 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, [&](Value *V) { return GatheredScalars.contains(V); }) && "Must contain at least single gathered value."); assert(TEPtr->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - PHINode *EntryPHI = - dyn_cast<PHINode>(TEPtr->UserTreeIndices.front().UserTE->getMainOp()); - Instruction *EntryUserInst = - EntryPHI ? 
nullptr - : &getLastInstructionInBundle( - TEPtr->UserTreeIndices.front().UserTE); - if (&UserInst == EntryUserInst) { - assert(!EntryPHI && "Unexpected phi node entry."); - // If 2 gathers are operands of the same entry, compare operands - // indices, use the earlier one as the base. - if (TE->UserTreeIndices.front().UserTE == - TEPtr->UserTreeIndices.front().UserTE && - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx) + "Expected only single user of a gather node."); + const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); + + PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp()); + const Instruction *InsertPt = + UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() + : &getLastInstructionInBundle(UseEI.UserTE); + if (TEInsertPt == InsertPt) { + // If 2 gathers are operands of the same entry (regardless of whether + // user is PHI or else), compare operands indices, use the earlier one + // as the base. + if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) + continue; + // If the user instruction is used for some reason in different + // vectorized nodes - make it depend on index. + if (TEUseEI.UserTE != UseEI.UserTE && + TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) continue; } - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - auto *EntryI = - EntryPHI - ? EntryPHI - ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx) - ->getTerminator() - : EntryUserInst; - if ((ParentBB != EntryI->getParent() || - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx || - TE->UserTreeIndices.front().UserTE != - TEPtr->UserTreeIndices.front().UserTE) && - !CheckOrdering(EntryI)) + + // Check if the user node of the TE comes after user node of TEPtr, + // otherwise TEPtr depends on TE. + if ((TEInsertBlock != InsertPt->getParent() || + TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && + !CheckOrdering(InsertPt)) continue; VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - Instruction &EntryUserInst = getLastInstructionInBundle(VTE); - if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + auto It = MinBWs.find(VTE); + // If vectorize node is demoted - do not match. 
+ if (It != MinBWs.end() && + It->second.first != DL->getTypeSizeInBits(V->getType())) continue; VToTEs.insert(VTE); } @@ -8823,8 +9470,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -8838,9 +9487,19 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); }); - if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { + if (It != FirstEntries.end() && + ((*It)->getVectorFactor() == VL.size() || + ((*It)->getVectorFactor() == TE->Scalars.size() && + TE->ReuseShuffleIndices.size() == VL.size() && + (*It)->isSame(TE->Scalars)))) { Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); + if ((*It)->getVectorFactor() == VL.size()) { + std::iota(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), 0); + } else { + SmallVector<int> CommonMask = TE->getCommonMask(); + copy(CommonMask, Mask.begin()); + } // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa<PoisonValue>(VL[I])) @@ -8923,12 +9582,9 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // by extractelements processing) or may form vector node in future. auto MightBeIgnored = [=](Value *V) { auto *I = dyn_cast<Instruction>(V); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && !isVectorLikeInstWithConstOps(I) && - !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); + !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I); }; // Check that the neighbor instruction may form a full vector node with the // current instruction V. It is possible, if they have same/alternate opcode @@ -8980,7 +9636,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min<int>(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -8993,9 +9652,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair<unsigned, int> &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9010,9 +9670,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL, break; } Entries.clear(); + // Clear the corresponding mask elements. 
+ std::fill(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } +SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. + if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector<std::optional<TTI::ShuffleKind>> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); + std::optional<TTI::ShuffleKind> SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + SmallVector<const TreeEntry *> LocalSubEntries; + LocalSubEntries.swap(SubEntries); + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa<PoisonValue>(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, LocalSubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9224,18 +9939,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { auto *Front = E->getMainOp(); Instruction *LastInst = &getLastInstructionInBundle(E); assert(LastInst && "Failed to find last instruction in bundle"); + BasicBlock::iterator LastInstIt = LastInst->getIterator(); // If the instruction is PHI, set the insert point after all the PHIs. bool IsPHI = isa<PHINode>(LastInst); if (IsPHI) - LastInst = LastInst->getParent()->getFirstNonPHI(); + LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); if (IsPHI || (E->State != TreeEntry::NeedToGather && doesNotNeedToSchedule(E->Scalars))) { - Builder.SetInsertPoint(LastInst); + Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); } else { // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(LastInst->getParent(), - std::next(LastInst->getIterator())); + Builder.SetInsertPoint( + LastInst->getParent(), + LastInst->getNextNonDebugInstruction()->getIterator()); } Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -9271,10 +9988,12 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) { GatherShuffleExtractSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. 
- if (TreeEntry *Entry = getTreeEntry(V)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(V); - ExternalUses.emplace_back(V, InsElt, FoundLane); + if (isa<Instruction>(V)) { + if (TreeEntry *Entry = getTreeEntry(V)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(V); + ExternalUses.emplace_back(V, InsElt, FoundLane); + } } return Vec; }; @@ -9367,12 +10086,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// Holds all of the instructions that we gathered. SetVector<Instruction *> &GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. - SetVector<BasicBlock *> &CSEBlocks; + DenseSet<BasicBlock *> &CSEBlocks; public: ShuffleIRBuilder(IRBuilderBase &Builder, SetVector<Instruction *> &GatherShuffleExtractSeq, - SetVector<BasicBlock *> &CSEBlocks) + DenseSet<BasicBlock *> &CSEBlocks) : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), CSEBlocks(CSEBlocks) {} ~ShuffleIRBuilder() = default; @@ -9392,7 +10111,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { return V1; unsigned VF = Mask.size(); unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements(); - if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask)) + if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF)) return V1; Value *Vec = Builder.CreateShuffleVector(V1, Mask); if (auto *I = dyn_cast<Instruction>(Vec)) { @@ -9455,7 +10174,11 @@ public: : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, + ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet<Value *, 4> UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9463,6 +10186,10 @@ public: continue; auto *EI = cast<ExtractElementInst>(E->Scalars[I]); VecBase = EI->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecBase)) + VecBase = TE->VectorizedValue; + assert(VecBase && "Expected vectorized value."); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -9471,14 +10198,97 @@ public: continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef<int> Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + // Perform multi-register vector shuffle, joining them into a single virtual + // long vector. + // Need to shuffle each part independently and then insert all this parts + // into a long virtual vector register, forming the original vector. 
+ Value *Vec = nullptr; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef<Value *> VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector<Value *, MaxBases> Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand(); + if (const TreeEntry *TE = R.getTreeEntry(VecOp)) + VecOp = TE->VectorizedValue; + assert(VecOp && "Expected vectorized value."); + const int Size = + cast<FixedVectorType>(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + assert((Part == 0 || all_of(seq<unsigned>(0, Part), + [&](unsigned P) { + ArrayRef<int> SubMask = + Mask.slice(P * SliceSize, SliceSize); + return all_of(SubMask, [](int Idx) { + return Idx == PoisonMaskElem; + }); + })) && + "Expected first part or all previous parts masked."); + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + } else { + unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast<FixedVectorType>(SubVec->getType())->getNumElements(); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) { + std::optional<Value *> + needToDelay(const TreeEntry *E, + ArrayRef<SmallVector<const TreeEntry *>> Deps) const { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; + if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -9487,6 +10297,16 @@ public: VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())), MaybeAlign()); } + /// Adds 2 input vectors (in form of tree entries) and the mask for their + /// shuffling. + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { + add(E1.VectorizedValue, E2.VectorizedValue, Mask); + } + /// Adds single input vector (in form of tree entry) and the mask for its + /// shuffling. + void add(const TreeEntry &E1, ArrayRef<int> Mask) { + add(E1.VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. 
void add(Value *V1, Value *V2, ArrayRef<int> Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9516,7 +10336,7 @@ public: InVectors.push_back(V1); } /// Adds another one input vector and the mask for the shuffling. - void add(Value *V1, ArrayRef<int> Mask) { + void add(Value *V1, ArrayRef<int> Mask, bool = false) { if (InVectors.empty()) { if (!isa<FixedVectorType>(V1->getType())) { V1 = createShuffle(V1, nullptr, CommonMask); @@ -9578,7 +10398,8 @@ public: inversePermutation(Order, NewMask); add(V1, NewMask); } - Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { + Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, + Value *Root = nullptr) { return R.gather(VL, Root); } Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } @@ -9639,8 +10460,14 @@ public: } }; -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { - ArrayRef<Value *> VL = E->getOperand(NodeIdx); +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, + bool PostponedPHIs) { + ValueList &VL = E->getOperand(NodeIdx); + if (E->State == TreeEntry::PossibleStridedVectorize && + !E->ReorderIndices.empty()) { + SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); + reorderScalars(VL, Mask); + } const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. @@ -9651,23 +10478,39 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { - if (TreeEntry *VE = getTreeEntry(S.OpValue); - VE && VE->isSame(VL) && - (any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { - return TE->isOperandGatherNode({E, NodeIdx}) && - VE->isSame(TE->Scalars); - }))) { + auto CheckSameVE = [&](const TreeEntry *VE) { + return VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) && + VE->isSame(TE->Scalars); + })); + }; + TreeEntry *VE = getTreeEntry(S.OpValue); + bool IsSameVE = VE && CheckSameVE(VE); + if (!IsSameVE) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { + return TE != VE && CheckSameVE(TE); + }); + if (I != It->getSecond().end()) { + VE = *I; + IsSameVE = true; + } + } + } + if (IsSameVE) { auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; - Value *V = vectorizeTree(VE); + Value *V = vectorizeTree(VE, PostponedPHIs); if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { // Reshuffle to get only unique values. 
@@ -9740,14 +10583,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { assert(I->get()->UserTreeIndices.size() == 1 && "Expected only single user for the gather node."); assert(I->get()->isSame(VL) && "Expected same list of scalars."); - IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->getOpcode() != Instruction::InsertElement && - E->getOpcode() != Instruction::PHI) { - Instruction *LastInst = &getLastInstructionInBundle(E); - assert(LastInst && "Failed to find last instruction in bundle"); - Builder.SetInsertPoint(LastInst); - } - return vectorizeTree(I->get()); + return vectorizeTree(I->get(), PostponedPHIs); } template <typename BVTy, typename ResTy, typename... Args> @@ -9765,7 +10601,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) { + auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa<UndefValue>(V) && !isa<PoisonValue>(V); })) @@ -9782,70 +10618,102 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { }); if (It == VectorizableTree.end()) return false; - unsigned I = - *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); - int Sz = Mask.size(); - if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && - ShuffleVectorInst::isIdentityMask(Mask)) + int Idx; + if ((Mask.size() < InputVF && + ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) && + Idx == 0) || + (Mask.size() == InputVF && + ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { std::iota(Mask.begin(), Mask.end(), 0); - else + } else { + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); std::fill(Mask.begin(), Mask.end(), I); + } return true; }; BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector<int> Mask; - SmallVector<int> ExtractMask; - std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle; - std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle; - SmallVector<const TreeEntry *> Entries; + SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem); + SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput = false; + SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; + SmallVector<SmallVector<const TreeEntry *>> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size()); + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. 
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector<Value *> IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + SmallVector<const TreeEntry *> ExtractEntries; + for (auto [Idx, I] : enumerate(ExtractMask)) { + if (I == PoisonMaskElem) + continue; + if (const auto *TE = getTreeEntry( + cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand())) + ExtractEntries.push_back(TE); + } + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, ExtractEntries)) { + // Delay emission of gathers which are not ready yet. + PostponedGathers.insert(E); + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + return *Delayed; + } + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (!GatherShuffles.empty()) { + if (std::optional<ResTy> Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *E->Scalars.front() << ".\n"); + << "SLP: perfect diamond match for gather bundle " + << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. 
- if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + Mask.resize(E->Scalars.size()); + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -9853,17 +10721,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(*FrontTE, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -9943,78 +10814,108 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (It != Scalars.end()) { // Replace undefs by the non-poisoned scalars and emit broadcast. int Pos = std::distance(Scalars.begin(), It); - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { // Set the undef position to the non-poisoned scalar. ReuseMask[I] = Pos; // Replace the undef by the poison, in the mask it is replaced by // non-poisoned scalar already. if (I != Pos) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } } else { // Replace undefs by the poisons, emit broadcast and then emit // freeze. - for_each(UndefPos, [&](int I) { + for (int I : UndefPos) { ReuseMask[I] = PoisonMaskElem; if (isa<UndefValue>(Scalars[I])) Scalars[I] = PoisonValue::get(ScalarTy); - }); + } NeedFreeze = true; } } }; - if (ExtractShuffle || GatherShuffle) { + if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa<UndefValue>(E->Scalars[I])) - continue; - auto *EI = cast<ExtractElementInst>(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa<UndefValue>(E->Scalars[I])) + continue; + auto *EI = cast<ExtractElementInst>(E->Scalars[I]); + Value *VecOp = EI->getVectorOperand(); + if (const auto *TE = getTreeEntry(VecOp)) + if (TE->VectorizedValue) + VecOp = TE->VectorizedValue; + if (!Vec1) { + Vec1 = VecOp; + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = VecOp; + } } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat(ExtractMask); - ShuffleBuilder.add(Vec1, ExtractMask); + IsUsedInExpr &= FindReusedSplat( + ExtractMask, + cast<FixedVectorType>(Vec1->getType())->getNumElements()); + ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), - ExtractMask); + ExtractMask, /*ForExtracts=*/true); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffles.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= + FindReusedSplat(VecMask, TEs.front()->getVectorFactor()); + ShuffleBuilder.add(*TEs.front(), VecMask); + if (TEs.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); + if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } 
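
The per-part shuffle assembly above (both in adjustExtracts and in the GatherShuffles loop) relies on two small mask transformations: lanes taken from a newly joined source are biased by the width of the vector accumulated so far, and once a shuffle has been emitted the surviving lanes are rewritten to an identity mapping so the next join treats the result as a single source. Below is a minimal standalone sketch of that bookkeeping in plain C++, not the patch's own helpers; the function names are illustrative only, and PoisonMaskElem is modeled as -1.

    // Standalone sketch (plain C++, not LLVM code) of the mask bookkeeping used
    // when per-register sub-vectors are joined into one wide vector.
    #include <cstddef>
    #include <vector>

    constexpr int PoisonMaskElem = -1;

    // Lanes taken from a newly joined source are biased by the element count of
    // the vector accumulated so far, so they index into the concatenation of
    // (accumulated vector, new sub-vector).
    void offsetNewSourceLanes(std::vector<int> &SubMask, unsigned AccumulatedVF) {
      for (int &Idx : SubMask)
        if (Idx != PoisonMaskElem)
          Idx += static_cast<int>(AccumulatedVF);
    }

    // After a shuffle has been emitted, the selected lanes already sit at their
    // final positions, so the mask collapses to an identity mapping on the
    // defined lanes.
    void transformToIdentity(std::vector<int> &Mask) {
      for (std::size_t I = 0, E = Mask.size(); I != E; ++I)
        if (Mask[I] != PoisonMaskElem)
          Mask[I] = static_cast<int>(I);
    }

In the patch this corresponds to the Idx += VF adjustment and the TransformToIdentity lambda applied after each createShuffle call in the multi-register paths shown above.
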
// Try to figure out best way to combine values: build a shuffle and insert @@ -10025,16 +10926,24 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && - ShuffleVectorInst::isIdentityMask(ExtractMask)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || + (!GatherShuffles.empty() && + all_of(GatherShuffles, + [](const std::optional<TTI::ShuffleKind> &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && - ShuffleVectorInst::isIdentityMask(Mask)); + ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = IsSingleShuffle && (none_of(GatheredScalars, @@ -10064,7 +10973,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -10078,13 +10987,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = ShuffleBuilder.gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. 
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Value *BV = ShuffleBuilder.gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { @@ -10109,10 +11018,12 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { *this); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue) { + if (E->VectorizedValue && + (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || + E->isAltShuffle())) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -10126,13 +11037,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Vec; } - auto FinalShuffle = [&](Value *V, const TreeEntry *E) { + auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy, + bool IsSigned) { + if (V->getType() != VecTy) + V = Builder.CreateIntCast(V, VecTy, IsSigned); ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); if (E->getOpcode() == Instruction::Store) { ArrayRef<int> Mask = ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), E->ReorderIndices.size()); ShuffleBuilder.add(V, Mask); + } else if (E->State == TreeEntry::PossibleStridedVectorize) { + ShuffleBuilder.addOrdered(V, std::nullopt); } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } @@ -10140,7 +11056,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { }; assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize) && + E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -10150,6 +11067,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarTy = Store->getValueOperand()->getType(); else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) ScalarTy = IE->getOperand(1)->getType(); + bool IsSigned = false; + auto It = MinBWs.find(E); + if (It != MinBWs.end()) { + ScalarTy = IntegerType::get(F->getContext(), It->second.first); + IsSigned = It->second.second; + } auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { @@ -10157,32 +11080,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && "PHI reordering is free."); + if (PostponedPHIs && E->VectorizedValue) + return E->VectorizedValue; auto *PH = cast<PHINode>(VL0); - Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - Value *V = NewPhi; - - // Adjust insertion point once all PHI's have been generated. - Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt()); + Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstNonPHIIt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + if (PostponedPHIs || !E->VectorizedValue) { + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->PHI = NewPhi; + Value *V = NewPhi; + + // Adjust insertion point once all PHI's have been generated. 
+ Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); - E->VectorizedValue = V; + E->VectorizedValue = V; + if (PostponedPHIs) + return V; + } + PHINode *NewPhi = cast<PHINode>(E->PHI); + // If phi node is fully emitted - exit. + if (NewPhi->getNumIncomingValues() != 0) + return NewPhi; // PHINodes may have multiple entries from the same block. We want to // visit every block once. SmallPtrSet<BasicBlock *, 4> VisitedBBs; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { ValueList Operands; - BasicBlock *IBB = PH->getIncomingBlock(i); + BasicBlock *IBB = PH->getIncomingBlock(I); // Stop emission if all incoming values are generated. if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return V; + return NewPhi; } if (!VisitedBBs.insert(IBB).second) { @@ -10192,37 +11128,54 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeOperand(E, i); + Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); + if (VecTy != Vec->getType()) { + assert(MinBWs.contains(getOperandEntry(E, I)) && + "Expected item in MinBWs."); + Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second); + } NewPhi->addIncoming(Vec, IBB); } assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return V; + return NewPhi; } case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); setInsertPointAfterBundle(E); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { auto *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); - auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + Value *Ptr = LI->getPointerOperand(); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - NewV = FinalShuffle(NewV, E); + NewV = FinalShuffle(NewV, E, VecTy, IsSigned); E->VectorizedValue = NewV; return NewV; } case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); - Value *V = vectorizeOperand(E, 1); + Value *V = vectorizeOperand(E, 1, PostponedPHIs); + ArrayRef<Value *> Op = E->getOperand(1); + Type *ScalarTy = Op.front()->getType(); + if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) { + assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); + std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1)); + assert(Res.first > 0 && "Expected item in MinBWs."); + V = Builder.CreateIntCast( + V, + FixedVectorType::get( + ScalarTy, + cast<FixedVectorType>(V->getType())->getNumElements()), + Res.second); + } // Create InsertVector shuffle if necessary auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { @@ -10255,7 +11208,57 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Mask[InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) { - V = Builder.CreateShuffleVector(V, Mask); + Value *V2 = nullptr; + bool 
IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); + SmallVector<int> InsertMask(Mask); + if (NumElts != NumScalars && Offset == 0) { + // Follow all insert element instructions from the current buildvector + // sequence. + InsertElementInst *Ins = cast<InsertElementInst>(VL0); + do { + std::optional<unsigned> InsertIdx = getInsertIndex(Ins); + if (!InsertIdx) + break; + if (InsertMask[*InsertIdx] == PoisonMaskElem) + InsertMask[*InsertIdx] = *InsertIdx; + if (!Ins->hasOneUse()) + break; + Ins = dyn_cast_or_null<InsertElementInst>( + Ins->getUniqueUndroppableUser()); + } while (Ins); + SmallBitVector UseMask = + buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + SmallBitVector IsFirstUndef = + isUndefVector(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + unsigned Idx = 0; + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) && + IsFirstUndef.test(I)) { + if (IsVNonPoisonous) { + InsertMask[I] = I < NumScalars ? I : 0; + continue; + } + if (!V2) + V2 = UndefValue::get(V->getType()); + if (Idx >= NumScalars) + Idx = NumScalars - 1; + InsertMask[I] = NumScalars + Idx; + ++Idx; + } else if (InsertMask[I] != PoisonMaskElem && + Mask[I] == PoisonMaskElem) { + InsertMask[I] = PoisonMaskElem; + } + } + } else { + InsertMask = Mask; + } + } + if (!V2) + V2 = PoisonValue::get(V->getType()); + V = Builder.CreateShuffleVector(V, V2, InsertMask); if (auto *I = dyn_cast<Instruction>(V)) { GatherShuffleExtractSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -10274,15 +11277,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && NumElts != NumScalars) { if (IsFirstUndef.all()) { - if (!ShuffleVectorInst::isIdentityMask(InsertMask)) { - SmallBitVector IsFirstPoison = - isUndefVector<true>(FirstInsert->getOperand(0), UseMask); - if (!IsFirstPoison.all()) { - for (unsigned I = 0; I < NumElts; I++) { - if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) - InsertMask[I] = I + NumElts; + if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) { + SmallBitVector IsFirstPoison = + isUndefVector<true>(FirstInsert->getOperand(0), UseMask); + if (!IsFirstPoison.all()) { + for (unsigned I = 0; I < NumElts; I++) { + if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) + InsertMask[I] = I + NumElts; + } } - } V = Builder.CreateShuffleVector( V, IsFirstPoison.all() ? PoisonValue::get(V->getType()) @@ -10330,15 +11333,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeOperand(E, 0); + Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } auto *CI = cast<CastInst>(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - V = FinalShuffle(V, E); + Instruction::CastOps VecOpcode = CI->getOpcode(); + Type *SrcScalarTy = VL0->getOperand(0)->getType(); + auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && + (SrcIt != MinBWs.end() || It != MinBWs.end())) { + // Check if the values are candidates to demote. 
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); + if (SrcIt != MinBWs.end()) + SrcBWSz = SrcIt->second.first; + unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); + if (BWSz == SrcBWSz) { + VecOpcode = Instruction::BitCast; + } else if (BWSz < SrcBWSz) { + VecOpcode = Instruction::Trunc; + } else if (It != MinBWs.end()) { + assert(BWSz > SrcBWSz && "Invalid cast!"); + VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; + } + } + Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) + ? InVec + : Builder.CreateCast(VecOpcode, InVec, VecTy); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10348,21 +11372,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeOperand(E, 0); + Value *L = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *R = vectorizeOperand(E, 1); + Value *R = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (L->getType() != R->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + L = Builder.CreateIntCast(L, VecTy, IsSigned); + R = Builder.CreateIntCast(R, VecTy, IsSigned); + } CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + // Do not cast for cmps. + VecTy = cast<FixedVectorType>(V->getType()); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10371,24 +11404,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeOperand(E, 0); + Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *True = vectorizeOperand(E, 1); + Value *True = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *False = vectorizeOperand(E, 2); + Value *False = vectorizeOperand(E, 2, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (True->getType() != False->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 1)) || + MinBWs.contains(getOperandEntry(E, 2))) && + "Expected item in MinBWs."); + True = Builder.CreateIntCast(True, VecTy, IsSigned); + False = Builder.CreateIntCast(False, VecTy, IsSigned); + } Value *V = Builder.CreateSelect(Cond, True, False); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10397,7 +11437,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeOperand(E, 0); + Value *Op = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -10410,7 +11450,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, 
E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10437,16 +11477,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeOperand(E, 0); + Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *RHS = vectorizeOperand(E, 1); + Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V = Builder.CreateBinOp( static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, @@ -10455,7 +11502,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10476,14 +11523,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new // LoadInst to ExternalUses list to make sure that an extract will // be generated in the future. - if (TreeEntry *Entry = getTreeEntry(PO)) { - // Find which lane we need to extract. - unsigned FoundLane = Entry->findLaneForValue(PO); - ExternalUses.emplace_back(PO, NewLI, FoundLane); + if (isa<Instruction>(PO)) { + if (TreeEntry *Entry = getTreeEntry(PO)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(PO); + ExternalUses.emplace_back(PO, NewLI, FoundLane); + } } } else { - assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeOperand(E, 0); + assert((E->State == TreeEntry::ScatterVectorize || + E->State == TreeEntry::PossibleStridedVectorize) && + "Unhandled state"); + Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10497,35 +11548,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = propagateMetadata(NewLI, E->Scalars); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Store: { auto *SI = cast<StoreInst>(VL0); - unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); - Value *VecValue = vectorizeOperand(E, 0); - VecValue = FinalShuffle(VecValue, E); + Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); + VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned); - Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); + Value *Ptr = SI->getPointerOperand(); StoreInst *ST = - Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); + Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); - // The pointer operand uses an in-tree scalar, so add the new BitCast or - // StoreInst to ExternalUses to make sure that an extract will be - // generated in the future. - if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) { - // Find which lane we need to extract. 
- unsigned FoundLane = Entry->findLaneForValue(ScalarPtr); - ExternalUses.push_back(ExternalUser( - ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST, - FoundLane)); + // The pointer operand uses an in-tree scalar, so add the new StoreInst to + // ExternalUses to make sure that an extract will be generated in the + // future. + if (isa<Instruction>(Ptr)) { + if (TreeEntry *Entry = getTreeEntry(Ptr)) { + // Find which lane we need to extract. + unsigned FoundLane = Entry->findLaneForValue(Ptr); + ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane)); + } } Value *V = propagateMetadata(ST, E->Scalars); @@ -10538,7 +11586,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *GEP0 = cast<GetElementPtrInst>(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeOperand(E, 0); + Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10546,7 +11594,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Value *> OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeOperand(E, J); + Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; @@ -10564,7 +11612,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = propagateMetadata(I, GEPs); } - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10586,41 +11634,42 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecCallCosts.first <= VecCallCosts.second; Value *ScalarArg = nullptr; - std::vector<Value *> OpVecs; + SmallVector<Value *> OpVecs; SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1)) TysForDecl.push_back( FixedVectorType::get(CI->getType(), E->Scalars.size())); - for (int j = 0, e = CI->arg_size(); j < e; ++j) { + for (unsigned I : seq<unsigned>(0, CI->arg_size())) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) { CallInst *CEI = cast<CallInst>(VL0); - ScalarArg = CEI->getArgOperand(j); - OpVecs.push_back(CEI->getArgOperand(j)); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + ScalarArg = CEI->getArgOperand(I); + OpVecs.push_back(CEI->getArgOperand(I)); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(ScalarArg->getType()); continue; } - Value *OpVec = vectorizeOperand(E, j); + Value *OpVec = vectorizeOperand(E, I, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); - if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I)) TysForDecl.push_back(OpVec->getType()); } Function *CF; if (!UseIntrinsic) { VFShape Shape = - VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( - VecTy->getNumElements())), + VFShape::get(CI->getFunctionType(), + ElementCount::getFixed( + static_cast<unsigned>(VecTy->getNumElements())), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { @@ -10634,7 +11683,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg) { + if (isa_and_present<Instruction>(ScalarArg)) { if (TreeEntry *Entry = getTreeEntry(ScalarArg)) { // Find which lane we need to extract. unsigned FoundLane = Entry->findLaneForValue(ScalarArg); @@ -10644,7 +11693,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E); + V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10662,20 +11711,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - RHS = vectorizeOperand(E, 1); + RHS = vectorizeOperand(E, 1, PostponedPHIs); } else { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0); + LHS = vectorizeOperand(E, 0, PostponedPHIs); } if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } + if (LHS && RHS && LHS->getType() != RHS->getType()) { + assert((MinBWs.contains(getOperandEntry(E, 0)) || + MinBWs.contains(getOperandEntry(E, 1))) && + "Expected item in MinBWs."); + LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned); + RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned); + } Value *V0, *V1; if (Instruction::isBinaryOp(E->getOpcode())) { @@ -10708,8 +11764,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
ValueList OpScalars, AltScalars; SmallVector<int> Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + E->buildAltOpShuffleMask( [E, this](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), @@ -10727,6 +11782,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CSEBlocks.insert(I->getParent()); } + if (V->getType() != VecTy && !isa<CmpInst>(VL0)) + V = Builder.CreateIntCast( + V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned); E->VectorizedValue = V; ++NumVectorInstructions; @@ -10767,9 +11825,19 @@ Value *BoUpSLP::vectorizeTree( // need to rebuild it. EntryToLastInstruction.clear(); - Builder.SetInsertPoint(ReductionRoot ? ReductionRoot - : &F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + if (ReductionRoot) + Builder.SetInsertPoint(ReductionRoot->getParent(), + ReductionRoot->getIterator()); + else + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + + // Postpone emission of PHIs operands to avoid cyclic dependencies issues. + (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) + if (TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() && + TE->VectorizedValue) + (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); // Run through the list of postponed gathers and emit them, replacing the temp // emitted allocas with actual vector instructions. ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); @@ -10786,9 +11854,32 @@ Value *BoUpSLP::vectorizeTree( TE->VectorizedValue = nullptr; auto *UserI = cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue); - Builder.SetInsertPoint(PrevVec); + // If user is a PHI node, its vector code have to be inserted right before + // block terminator. Since the node was delayed, there were some unresolved + // dependencies at the moment when stab instruction was emitted. In a case + // when any of these dependencies turn out an operand of another PHI, coming + // from this same block, position of a stab instruction will become invalid. + // The is because source vector that supposed to feed this gather node was + // inserted at the end of the block [after stab instruction]. So we need + // to adjust insertion point again to the end of block. + if (isa<PHINode>(UserI)) { + // Insert before all users. + Instruction *InsertPt = PrevVec->getParent()->getTerminator(); + for (User *U : PrevVec->users()) { + if (U == UserI) + continue; + auto *UI = dyn_cast<Instruction>(U); + if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent()) + continue; + if (UI->comesBefore(InsertPt)) + InsertPt = UI; + } + Builder.SetInsertPoint(InsertPt); + } else { + Builder.SetInsertPoint(PrevVec); + } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); - Value *Vec = vectorizeTree(TE); + Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); PostponedValues.try_emplace(Vec).first->second.push_back(TE); // Replace the stub vector node, if it was used before for one of the @@ -10801,26 +11892,6 @@ Value *BoUpSLP::vectorizeTree( eraseInstruction(PrevVec); } - // If the vectorized tree can be rewritten in a smaller type, we truncate the - // vectorized root. InstCombine will then rewrite the entire expression. 
We - // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - if (auto *I = dyn_cast<Instruction>(VectorRoot)) { - // If current instr is a phi and not the last phi, insert it after the - // last phi node. - if (isa<PHINode>(I)) - Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); - else - Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - } - auto BundleWidth = VectorizableTree[0]->Scalars.size(); - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); - auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0]->VectorizedValue = Trunc; - } - LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -10830,6 +11901,8 @@ Value *BoUpSLP::vectorizeTree( // Maps extract Scalar to the corresponding extractelement instruction in the // basic block. Only one extractelement per block should be emitted. DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs; + SmallDenseSet<Value *, 4> UsedInserts; + DenseMap<Value *, Value *> VectorCasts; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -10864,7 +11937,8 @@ Value *BoUpSLP::vectorizeTree( Instruction *I = EEIt->second; if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && Builder.GetInsertPoint()->comesBefore(I)) - I->moveBefore(&*Builder.GetInsertPoint()); + I->moveBefore(*Builder.GetInsertPoint()->getParent(), + Builder.GetInsertPoint()); Ex = I; } } @@ -10887,11 +11961,10 @@ Value *BoUpSLP::vectorizeTree( } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. - if (!MinBWs.count(ScalarRoot)) - return Ex; - if (MinBWs[ScalarRoot].second) - return Builder.CreateSExt(Ex, Scalar->getType()); - return Builder.CreateZExt(Ex, Scalar->getType()); + if (Scalar->getType() != Ex->getType()) + return Builder.CreateIntCast(Ex, Scalar->getType(), + MinBWs.find(E)->second.second); + return Ex; } assert(isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && @@ -10909,12 +11982,13 @@ Value *BoUpSLP::vectorizeTree( "ExternallyUsedValues map"); if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (auto *PHI = dyn_cast<PHINode>(VecI)) - Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI()); + Builder.SetInsertPoint(PHI->getParent(), + PHI->getParent()->getFirstNonPHIIt()); else Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); // Required to update internally referenced instructions. @@ -10927,12 +12001,26 @@ Value *BoUpSLP::vectorizeTree( // Skip if the scalar is another vector op or Vec is not an instruction. if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { + if (!UsedInserts.insert(VU).second) + continue; + // Need to use original vector, if the root is truncated. 
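With bit widths now tracked per tree entry, the extract helper above no longer special-cases the root: any extracted scalar whose type differs from the original is cast back using the recorded signedness. A small illustrative helper under that assumption (WasSigned mirrors the MinBWs signedness bit; the helper name is invented for this sketch):

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Extract lane `Lane` from Vec and, if the vector was built in a narrower
  // integer type, cast the scalar back to its original type.
  static Value *extractAndExtend(IRBuilder<> &B, Value *Vec, unsigned Lane,
                                 Type *OrigScalarTy, bool WasSigned) {
    Value *Ex = B.CreateExtractElement(Vec, Lane);
    if (Ex->getType() != OrigScalarTy)
      Ex = B.CreateIntCast(Ex, OrigScalarTy, WasSigned);
    return Ex;
  }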
+ auto BWIt = MinBWs.find(E); + if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { + auto VecIt = VectorCasts.find(Scalar); + if (VecIt == VectorCasts.end()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + if (auto *IVec = dyn_cast<Instruction>(Vec)) + Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); + Vec = Builder.CreateIntCast(Vec, VU->getType(), + BWIt->second.second); + VectorCasts.try_emplace(Scalar, Vec); + } else { + Vec = VecIt->second; + } + } + std::optional<unsigned> InsertIdx = getInsertIndex(VU); if (InsertIdx) { - // Need to use original vector, if the root is truncated. - if (MinBWs.count(Scalar) && - VectorizableTree[0]->VectorizedValue == Vec) - Vec = VectorRoot; auto *It = find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { // Checks if 2 insertelements are from the same buildvector. @@ -10992,18 +12080,18 @@ Value *BoUpSLP::vectorizeTree( // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (PHINode *PH = dyn_cast<PHINode>(User)) { - for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { - if (PH->getIncomingValue(i) == Scalar) { + for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { + if (PH->getIncomingValue(I) == Scalar) { Instruction *IncomingTerminator = - PH->getIncomingBlock(i)->getTerminator(); + PH->getIncomingBlock(I)->getTerminator(); if (isa<CatchSwitchInst>(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { - Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); + Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator()); } Value *NewInst = ExtractAndExtendIfNeeded(Vec); - PH->setOperand(i, NewInst); + PH->setOperand(I, NewInst); } } } else { @@ -11012,7 +12100,7 @@ Value *BoUpSLP::vectorizeTree( User->replaceUsesOfWith(Scalar, NewInst); } } else { - Builder.SetInsertPoint(&F->getEntryBlock().front()); + Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); Value *NewInst = ExtractAndExtendIfNeeded(Vec); User->replaceUsesOfWith(Scalar, NewInst); } @@ -11085,7 +12173,7 @@ Value *BoUpSLP::vectorizeTree( // non-resizing mask. if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) ->getNumElements() || - !ShuffleVectorInst::isIdentityMask(Mask)) + !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) return CreateShuffle(Vals.front(), nullptr, Mask); return Vals.front(); } @@ -11676,7 +12764,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } - auto makeControlDependent = [&](Instruction *I) { + auto MakeControlDependent = [&](Instruction *I) { auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->ControlDependencies.push_back(BundleMember); @@ -11698,7 +12786,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); if (!isGuaranteedToTransferExecutionToSuccessor(I)) // Everything past here must be control dependent on I. 
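The new VectorCasts map ensures the widening cast of a narrowed vector is emitted once per original scalar and reused by every external insertelement user instead of being recreated at each use. A compact sketch of that memoization, with hypothetical parameter names:

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Create (at most) one widening cast of the narrowed vector per original
  // scalar; later external users of the same scalar reuse the cached cast.
  static Value *getWidenedVector(IRBuilder<> &B,
                                 DenseMap<Value *, Value *> &Casts,
                                 Value *Scalar, Value *NarrowVec,
                                 Type *WideVecTy, bool IsSigned) {
    auto It = Casts.find(Scalar);
    if (It != Casts.end())
      return It->second;
    Value *Wide = B.CreateIntCast(NarrowVec, WideVecTy, IsSigned);
    Casts.try_emplace(Scalar, Wide);
    return Wide;
  }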
@@ -11724,7 +12812,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); } } @@ -11742,7 +12830,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, continue; // Add the dependency - makeControlDependent(I); + MakeControlDependent(I); break; } } @@ -11757,7 +12845,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, "NextLoadStore list for non memory effecting bundle?"); MemoryLocation SrcLoc = getLocation(SrcInst); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; + unsigned NumAliased = 0; unsigned DistToSrc = 1; for (; DepDest; DepDest = DepDest->NextLoadStore) { @@ -11772,13 +12860,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || + (NumAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { // We increment the counter only if the locations are aliased // (instead of counting all alias checks). This gives a better // balance between reduced runtime and accurate dependencies. - numAliased++; + NumAliased++; DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; @@ -11880,20 +12968,20 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Do the "real" scheduling. while (!ReadyInsts.empty()) { - ScheduleData *picked = *ReadyInsts.begin(); + ScheduleData *Picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); // Move the scheduled instruction(s) to their dedicated places, if not // there yet. - for (ScheduleData *BundleMember = picked; BundleMember; + for (ScheduleData *BundleMember = Picked; BundleMember; BundleMember = BundleMember->NextInBundle) { - Instruction *pickedInst = BundleMember->Inst; - if (pickedInst->getNextNode() != LastScheduledInst) - pickedInst->moveBefore(LastScheduledInst); - LastScheduledInst = pickedInst; + Instruction *PickedInst = BundleMember->Inst; + if (PickedInst->getNextNode() != LastScheduledInst) + PickedInst->moveBefore(LastScheduledInst); + LastScheduledInst = PickedInst; } - BS->schedule(picked, ReadyInsts); + BS->schedule(Picked, ReadyInsts); } // Check that we didn't break any of our invariants. @@ -11994,21 +13082,22 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // Determine if a value V in a vectorizable expression Expr can be demoted to a // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. -static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, - SmallVectorImpl<Value *> &ToDemote, - SmallVectorImpl<Value *> &Roots) { +bool BoUpSLP::collectValuesToDemote( + Value *V, SmallVectorImpl<Value *> &ToDemote, + DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts, + SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const { // We can always demote constants. - if (isa<Constant>(V)) { - ToDemote.push_back(V); + if (isa<Constant>(V)) return true; - } - // If the value is not an instruction in the expression with only one use, it - // cannot be demoted. + // If the value is not a vectorized instruction in the expression with only + // one use, it cannot be demoted. 
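The memory-dependence scan above bounds its cost with two budgets: MaxMemDepDistance caps how far past the source instruction it looks, and AliasedCheckLimit caps how many precise alias queries are issued before the scan turns conservative. A simplified standalone model of that control flow, in plain C++ with a toy MemOp type instead of ScheduleData:

  #include <functional>
  #include <vector>

  struct MemOp { bool MayWrite = false; };

  // Simplified model: once either budget is exhausted, a dependency is
  // recorded without asking the (costly) alias analysis again.
  static void addMemoryDeps(const std::vector<MemOp> &Later, bool SrcMayWrite,
                            unsigned MaxDist, unsigned AliasBudget,
                            const std::function<bool(const MemOp &)> &IsAliased) {
    unsigned NumAliased = 0, DistToSrc = 1;
    for (const MemOp &Dep : Later) {
      if (DistToSrc >= MaxDist ||
          ((SrcMayWrite || Dep.MayWrite) &&
           (NumAliased >= AliasBudget || IsAliased(Dep)))) {
        ++NumAliased;            // count aliased (or conservatively assumed) pairs
        /* record the dependency on Dep here */
      }
      ++DistToSrc;
    }
  }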
auto *I = dyn_cast<Instruction>(V); - if (!I || !I->hasOneUse() || !Expr.count(I)) + if (!I || !I->hasOneUse() || !getTreeEntry(I) || !Visited.insert(I).second) return false; + unsigned Start = 0; + unsigned End = I->getNumOperands(); switch (I->getOpcode()) { // We can always demote truncations and extensions. Since truncations can @@ -12030,16 +13119,21 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::And: case Instruction::Or: case Instruction::Xor: - if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || - !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots, + Visited) || + !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots, + Visited)) return false; break; // We can demote selects if we can demote their true and false values. case Instruction::Select: { + Start = 1; SelectInst *SI = cast<SelectInst>(I); - if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || - !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) + if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts, + Roots, Visited) || + !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts, + Roots, Visited)) return false; break; } @@ -12049,7 +13143,8 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, case Instruction::PHI: { PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots, + Visited)) return false; break; } @@ -12059,6 +13154,10 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, return false; } + // Gather demoted constant operands. + for (unsigned Idx : seq<unsigned>(Start, End)) + if (isa<Constant>(I->getOperand(Idx))) + DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx); // Record the value that we can demote. ToDemote.push_back(V); return true; @@ -12076,44 +13175,26 @@ void BoUpSLP::computeMinimumValueSizes() { if (!TreeRootIT) return; - // If the expression is not rooted by a store, these roots should have - // external uses. We will rely on InstCombine to rewrite the expression in - // the narrower type. However, InstCombine only rewrites single-use values. - // This means that if a tree entry other than a root is used externally, it - // must have multiple uses and InstCombine will not rewrite it. The code - // below ensures that only the roots are used externally. - SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); - for (auto &EU : ExternalUses) - if (!Expr.erase(EU.Scalar)) - return; - if (!Expr.empty()) + // Ensure the roots of the vectorizable tree don't form a cycle. + if (!VectorizableTree.front()->UserTreeIndices.empty()) return; - // Collect the scalar values of the vectorizable expression. We will use this - // context to determine which values can be demoted. If we see a truncation, - // we mark it as seeding another demotion. - for (auto &EntryPtr : VectorizableTree) - Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); - - // Ensure the roots of the vectorizable tree don't form a cycle. They must - // have a single external user that is not in the vectorizable tree. 
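collectValuesToDemote is now a member that only walks vectorized, single-use instructions and carries a Visited set so cycles through PHIs terminate. Stripped of the LLVM specifics, the recursion has this shape (a toy model, not the pass's code):

  #include <unordered_set>
  #include <vector>

  struct Node {
    bool IsConstant = false;
    unsigned NumUses = 1;
    std::vector<Node *> Operands;   // operands that must also be demotable
  };

  // A value can be computed in a narrower type if it is a constant, or a
  // single-use node whose relevant operands can all be demoted as well.
  static bool collectToDemote(Node *N, std::vector<Node *> &ToDemote,
                              std::unordered_set<Node *> &Visited) {
    if (N->IsConstant)
      return true;
    if (N->NumUses != 1 || !Visited.insert(N).second)
      return false;
    for (Node *Op : N->Operands)
      if (!collectToDemote(Op, ToDemote, Visited))
        return false;
    ToDemote.push_back(N);
    return true;
  }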
- for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) - return; - // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and // additional roots that require investigating in Roots. SmallVector<Value *, 32> ToDemote; + DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts; SmallVector<Value *, 4> Roots; - for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + for (auto *Root : TreeRoot) { + DenseSet<Value *> Visited; + if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited)) return; + } // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots // of the expression to this width. - auto MaxBitWidth = 8u; + auto MaxBitWidth = 1u; // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. @@ -12138,9 +13219,9 @@ void BoUpSLP::computeMinimumValueSizes() { // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa<GetElementPtrInst>(R->user_back()); + all_of(TreeRoot, [](Value *V) { + return all_of(V->users(), + [](User *U) { return isa<GetElementPtrInst>(U); }); })) { MaxBitWidth = 8u; @@ -12189,12 +13270,39 @@ void BoUpSLP::computeMinimumValueSizes() { // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. - while (!Roots.empty()) - collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); + while (!Roots.empty()) { + DenseSet<Value *> Visited; + collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots, + Visited); + } // Finally, map the values we can demote to the maximum bit with we computed. - for (auto *Scalar : ToDemote) - MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); + for (auto *Scalar : ToDemote) { + auto *TE = getTreeEntry(Scalar); + assert(TE && "Expected vectorized scalar."); + if (MinBWs.contains(TE)) + continue; + bool IsSigned = any_of(TE->Scalars, [&](Value *R) { + KnownBits Known = computeKnownBits(R, *DL); + return !Known.isNonNegative(); + }); + MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); + const auto *I = cast<Instruction>(Scalar); + auto DCIt = DemotedConsts.find(I); + if (DCIt != DemotedConsts.end()) { + for (unsigned Idx : DCIt->getSecond()) { + // Check that all instructions operands are demoted. 
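The minimum-bit-width computation now starts from 1 bit and grows it from what the roots actually need. As a rough, hedged sketch of the underlying idea only (the real pass also consults DemandedBits and special-cases roots used solely by GEPs, so this is an approximation, not its exact logic):

  #include <algorithm>

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  #include "llvm/Support/MathExtras.h"

  using namespace llvm;

  // Unsigned-looking values need their active bits, possibly-negative values
  // their significant bits; round up to a power of two for the element type.
  static unsigned requiredBitWidth(const Value *Root, const DataLayout &DL) {
    unsigned TyBits = Root->getType()->getScalarSizeInBits();
    KnownBits Known = computeKnownBits(Root, DL);
    unsigned Bits = Known.isNonNegative()
                        ? Known.countMaxActiveBits()
                        : TyBits - ComputeNumSignBits(Root, DL) + 1;
    return std::max(1u, (unsigned)PowerOf2Ceil(Bits));
  }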
+ if (all_of(TE->Scalars, [&](Value *V) { + auto SIt = DemotedConsts.find(cast<Instruction>(V)); + return SIt != DemotedConsts.end() && + is_contained(SIt->getSecond(), Idx); + })) { + const TreeEntry *CTE = getOperandEntry(TE, Idx); + MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned); + } + } + } + } } PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -12348,139 +13456,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - int E = Stores.size(); - SmallBitVector Tails(E, false); - int MaxIter = MaxStoreLookup.getValue(); - SmallVector<std::pair<int, int>, 16> ConsecutiveChain( - E, std::make_pair(E, INT_MAX)); - SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); - int IterCnt; - auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, - &CheckedPairs, - &ConsecutiveChain](int K, int Idx) { - if (IterCnt >= MaxIter) - return true; - if (CheckedPairs[Idx].test(K)) - return ConsecutiveChain[K].second == 1 && - ConsecutiveChain[K].first == Idx; - ++IterCnt; - CheckedPairs[Idx].set(K); - CheckedPairs[K].set(Idx); - std::optional<int> Diff = getPointersDiff( - Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), - Stores[Idx]->getValueOperand()->getType(), - Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - if (!Diff || *Diff == 0) - return false; - int Val = *Diff; - if (Val < 0) { - if (ConsecutiveChain[Idx].second > -Val) { - Tails.set(K); - ConsecutiveChain[Idx] = std::make_pair(K, -Val); - } - return false; + // Stores the pair of stores (first_store, last_store) in a range, that were + // already tried to be vectorized. Allows to skip the store ranges that were + // already tried to be vectorized but the attempts were unsuccessful. + DenseSet<std::pair<Value *, Value *>> TriedSequences; + struct StoreDistCompare { + bool operator()(const std::pair<unsigned, int> &Op1, + const std::pair<unsigned, int> &Op2) const { + return Op1.second < Op2.second; } - if (ConsecutiveChain[K].second <= Val) - return false; - - Tails.set(Idx); - ConsecutiveChain[K] = std::make_pair(Idx, Val); - return Val == 1; }; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - for (int Idx = E - 1; Idx >= 0; --Idx) { - // If a store has multiple consecutive store candidates, search according - // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. - const int MaxLookDepth = std::max(E - Idx, Idx + 1); - IterCnt = 0; - for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) - if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || - (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) - break; - } - - // Tracks if we tried to vectorize stores starting from the given tail - // already. - SmallBitVector TriedTails(E, false); - // For stores that start but don't end a link in the chain: - for (int Cnt = E; Cnt > 0; --Cnt) { - int I = Cnt - 1; - if (ConsecutiveChain[I].first == E || Tails.test(I)) - continue; - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. + // A set of pairs (index of store in Stores array ref, Distance of the store + // address relative to base store address in units). 
+ using StoreIndexToDistSet = + std::set<std::pair<unsigned, int>, StoreDistCompare>; + auto TryToVectorize = [&](const StoreIndexToDistSet &Set) { + int PrevDist = -1; BoUpSLP::ValueList Operands; // Collect the chain into a list. - while (I != E && !VectorizedStores.count(Stores[I])) { - Operands.push_back(Stores[I]); - Tails.set(I); - if (ConsecutiveChain[I].second != 1) { - // Mark the new end in the chain and go back, if required. It might be - // required if the original stores come in reversed order, for example. - if (ConsecutiveChain[I].first != E && - Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && - !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { - TriedTails.set(I); - Tails.reset(ConsecutiveChain[I].first); - if (Cnt < ConsecutiveChain[I].first + 2) - Cnt = ConsecutiveChain[I].first + 2; + for (auto [Idx, Data] : enumerate(Set)) { + if (Operands.empty() || Data.second - PrevDist == 1) { + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + if (Idx != Set.size() - 1) + continue; + } + if (Operands.size() <= 1) { + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + continue; + } + + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); + + unsigned MaxVF = + std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast<StoreInst>(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); + + if (MaxVF <= MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF + << ") <= " + << "MinVF (" << MinVF << ")\n"); + } + + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + unsigned StartIdx = 0; + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast<StoreInst>(V)->getValueOperand()->getType() == + cast<StoreInst>(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } - break; + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } - // Move to the next value in the chain. 
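TryToVectorize above gathers a maximal run of distance-1 stores and then probes it with decreasing power-of-two slice sizes, remembering how much of the prefix has already been consumed at a larger width. The search structure, reduced to a standalone model (assumes MinVF >= 1; TrySlice stands in for vectorizeStoreChain):

  #include <cstddef>
  #include <functional>

  static void vectorizeChain(size_t NumStores, unsigned MaxVF, unsigned MinVF,
                             const std::function<bool(size_t, unsigned)> &TrySlice) {
    size_t StartIdx = 0;
    for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
      for (size_t Cnt = StartIdx; Cnt + Size <= NumStores;) {
        if (TrySlice(Cnt, Size)) {
          if (Cnt == StartIdx)
            StartIdx += Size;     // never revisit a fully vectorized prefix
          Cnt += Size;
          continue;
        }
        ++Cnt;
      }
      if (StartIdx >= NumStores)
        break;                    // everything was consumed at larger widths
    }
  }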
- I = ConsecutiveChain[I].first; + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; } - assert(!Operands.empty() && "Expected non-empty list of stores."); + }; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); - unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); - - unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), - MaxElts); - auto *Store = cast<StoreInst>(Operands[0]); - Type *StoreTy = Store->getValueOperand()->getType(); - Type *ValueTy = StoreTy; - if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) - ValueTy = Trunc->getSrcTy(); - unsigned MinVF = TTI->getStoreMinimumVF( - R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - - if (MaxVF <= MinVF) { - LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " - << "MinVF (" << MinVF << ")\n"); - } - - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - unsigned StartIdx = 0; - for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; + // Stores pair (first: index of the store into Stores array ref, address of + // which taken as base, second: sorted set of pairs {index, dist}, which are + // indices of stores in the set and their store location distances relative to + // the base address). + + // Need to store the index of the very first store separately, since the set + // may be reordered after the insertion and the first store may be moved. This + // container allows to reduce number of calls of getPointersDiff() function. + SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores; + // Inserts the specified store SI with the given index Idx to the set of the + // stores. If the store with the same distance is found already - stop + // insertion, try to vectorize already found stores. If some stores from this + // sequence were not vectorized - try to vectorize them with the new store + // later. But this logic is applied only to the stores, that come before the + // previous store with the same distance. + // Example: + // 1. store x, %p + // 2. store y, %p+1 + // 3. store z, %p+2 + // 4. store a, %p + // 5. store b, %p+3 + // - Scan this from the last to first store. The very first bunch of stores is + // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores + // vector). + // - The next store in the list - #1 - has the same distance from store #5 as + // the store #4. + // - Try to vectorize sequence of stores 4,2,3,5. + // - If all these stores are vectorized - just drop them. + // - If some of them are not vectorized (say, #3 and #5), do extra analysis. + // - Start new stores sequence. + // The new bunch of stores is {1, {1, 0}}. + // - Add the stores from previous sequence, that were not vectorized. 
+ // Here we consider the stores in the reversed order, rather they are used in + // the IR (Stores are reversed already, see vectorizeStoreChains() function). + // Store #3 can be added -> comes after store #4 with the same distance as + // store #1. + // Store #5 cannot be added - comes before store #4. + // This logic allows to improve the compile time, we assume that the stores + // after previous store with the same distance most likely have memory + // dependencies and no need to waste compile time to try to vectorize them. + // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. + auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { + for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) { + std::optional<int> Diff = getPointersDiff( + Stores[Set.first]->getValueOperand()->getType(), + Stores[Set.first]->getPointerOperand(), + SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + if (!Diff) + continue; + auto It = Set.second.find(std::make_pair(Idx, *Diff)); + if (It == Set.second.end()) { + Set.second.emplace(Idx, *Diff); + return; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) - break; + // Try to vectorize the first found set to avoid duplicate analysis. + TryToVectorize(Set.second); + StoreIndexToDistSet PrevSet; + PrevSet.swap(Set.second); + Set.first = Idx; + Set.second.emplace(Idx, 0); + // Insert stores that followed previous match to try to vectorize them + // with this store. + unsigned StartIdx = It->first + 1; + SmallBitVector UsedStores(Idx - StartIdx); + // Distances to previously found dup store (or this store, since they + // store to the same addresses). + SmallVector<int> Dists(Idx - StartIdx, 0); + for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) { + // Do not try to vectorize sequences, we already tried. + if (Pair.first <= It->first || + VectorizedStores.contains(Stores[Pair.first])) + break; + unsigned BI = Pair.first - StartIdx; + UsedStores.set(BI); + Dists[BI] = Pair.second - It->second; + } + for (unsigned I = StartIdx; I < Idx; ++I) { + unsigned BI = I - StartIdx; + if (UsedStores.test(BI)) + Set.second.emplace(I, Dists[BI]); + } + return; } + auto &Res = SortedStores.emplace_back(); + Res.first = Idx; + Res.second.emplace(Idx, 0); + }; + StoreInst *PrevStore = Stores.front(); + for (auto [I, SI] : enumerate(Stores)) { + // Check that we do not try to vectorize stores of different types. + if (PrevStore->getValueOperand()->getType() != + SI->getValueOperand()->getType()) { + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + SortedStores.clear(); + PrevStore = SI; + } + FillStoresSet(I, SI); } + // Final vectorization attempt. + for (auto &Set : SortedStores) + TryToVectorize(Set.second); + return Changed; } @@ -12507,8 +13682,10 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // constant index, or a pointer operand that doesn't point to a scalar // type. else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - auto Idx = GEP->idx_begin()->get(); - if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) + if (GEP->getNumIndices() != 1) + continue; + Value *Idx = GEP->idx_begin()->get(); + if (isa<Constant>(Idx)) continue; if (!isValidElementType(Idx->getType())) continue; @@ -12542,8 +13719,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // NOTE: the following will give user internal llvm type name, which may // not be useful. 
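The worked example in the comment boils down to one data structure: a set of (store index, distance-from-base) pairs ordered by distance, in which any run of distances increasing by exactly 1 is a candidate chain. A small standalone model of that set and of extracting such runs, purely for illustration:

  #include <set>
  #include <utility>
  #include <vector>

  struct DistCompare {
    bool operator()(const std::pair<unsigned, int> &A,
                    const std::pair<unsigned, int> &B) const {
      return A.second < B.second;   // order by distance, not by store index
    }
  };
  using StoreIndexToDistSet = std::set<std::pair<unsigned, int>, DistCompare>;

  // Walking the set in distance order exposes runs of consecutive addresses,
  // e.g. distances {-3,-2,-1,0} for stores #4, #2, #3, #5 in the example.
  static std::vector<std::vector<unsigned>>
  consecutiveRuns(const StoreIndexToDistSet &Set) {
    std::vector<std::vector<unsigned>> Runs;
    int PrevDist = 0;
    for (const auto &[Idx, Dist] : Set) {
      if (Runs.empty() || Dist != PrevDist + 1)
        Runs.emplace_back();
      Runs.back().push_back(Idx);
      PrevDist = Dist;
    }
    return Runs;
  }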
R.getORE()->emit([&]() { - std::string type_str; - llvm::raw_string_ostream rso(type_str); + std::string TypeStr; + llvm::raw_string_ostream rso(TypeStr); Ty->print(rso); return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) << "Cannot SLP vectorize list: type " @@ -12878,10 +14055,12 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - bool UseSelect = ReductionOps.size() == 2 || - // Logical or/and. - (ReductionOps.size() == 1 && - isa<SelectInst>(ReductionOps.front().front())); + bool UseSelect = + ReductionOps.size() == 2 || + // Logical or/and. + (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) { + return isa<SelectInst>(V); + })); assert((!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && "Expected cmp + select pairs for reduction"); @@ -13315,12 +14494,26 @@ public: // Update the final value in the reduction. Builder.SetCurrentDebugLocation( cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) || + (isGuaranteedNotToBePoison(Res) && + !isGuaranteedNotToBePoison(VectorizedTree))) { + auto It = ReducedValsToOps.find(Res); + if (It != ReducedValsToOps.end() && + any_of(It->getSecond(), + [](Instruction *I) { return isBoolLogicOp(I); })) + std::swap(VectorizedTree, Res); + } + return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", ReductionOps); } // Initialize the final value in the reduction. return Res; }; + bool AnyBoolLogicOp = + any_of(ReductionOps.back(), [](Value *V) { + return isBoolLogicOp(cast<Instruction>(V)); + }); // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -13364,10 +14557,12 @@ public: // Check if the reduction value was not overriden by the extractelement // instruction because of the vectorization and exclude it, if it is not // compatible with other values. - if (auto *Inst = dyn_cast<Instruction>(RdxVal)) - if (isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) - continue; + // Also check if the instruction was folded to constant/other value. + auto *Inst = dyn_cast<Instruction>(RdxVal); + if ((Inst && isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || + (S.getOpcode() && !Inst)) + continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } @@ -13543,11 +14738,9 @@ public: for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) continue; - for_each(ReducedVals[Cnt], - [&LocalExternallyUsedValues, &TrackedVals](Value *V) { - if (isa<Instruction>(V)) - LocalExternallyUsedValues[TrackedVals[V]]; - }); + for (Value *V : ReducedVals[Cnt]) + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; } if (!IsSupportedHorRdxIdentityOp) { // Number of uses of the candidates in the vector of values. @@ -13591,7 +14784,7 @@ public: // Update LocalExternallyUsedValues for the scalar, replaced by // extractelement instructions. 
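UseSelect above keeps i1 reductions in their select ("logical") form because `select i1 a, i1 b, false` does not depend on a poison b when a is false, whereas a plain `and a, b` would propagate the poison. When such operands later have to be combined with plain boolean ops, they are frozen first. An illustrative helper in that spirit, not the pass's createOp itself:

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Build a poison-safe logical AND; freeze the RHS first if it may be
  // poison and has to survive reassociation into non-short-circuit ops.
  static Value *buildLogicalAnd(IRBuilder<> &B, Value *LHS, Value *RHS,
                                bool RHSMayBePoison) {
    if (RHSMayBePoison)
      RHS = B.CreateFreeze(RHS);
    return B.CreateSelect(LHS, RHS, B.getFalse()); // select i1 LHS, i1 RHS, false
  }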
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) { - auto It = ExternallyUsedValues.find(Pair.first); + auto *It = ExternallyUsedValues.find(Pair.first); if (It == ExternallyUsedValues.end()) continue; LocalExternallyUsedValues[Pair.second].append(It->second); @@ -13605,7 +14798,8 @@ public: InstructionCost ReductionCost = getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost + << " for reduction\n"); if (!Cost.isValid()) return nullptr; if (Cost >= -SLPCostThreshold) { @@ -13652,7 +14846,9 @@ public: // To prevent poison from leaking across what used to be sequential, // safe, scalar boolean logic operations, the reduction operand must be // frozen. - if (isBoolLogicOp(RdxRootInst)) + if ((isBoolLogicOp(RdxRootInst) || + (AnyBoolLogicOp && VL.size() != TrackedVals.size())) && + !isGuaranteedNotToBePoison(VectorizedRoot)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); // Emit code to correctly handle reused reduced values, if required. @@ -13664,6 +14860,16 @@ public: Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + if (ReducedSubTree->getType() != VL.front()->getType()) { + ReducedSubTree = Builder.CreateIntCast( + ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) { + KnownBits Known = computeKnownBits( + R, cast<Instruction>(ReductionOps.front().front()) + ->getModule() + ->getDataLayout()); + return !Known.isNonNegative(); + })); + } // Improved analysis for add/fadd/xor reductions with same scale factor // for all operands of reductions. We can emit scalar ops for them @@ -13716,31 +14922,33 @@ public: // RedOp2 = select i1 ?, i1 RHS, i1 false // Then, we must freeze LHS in the new op. - auto &&FixBoolLogicalOps = - [&Builder, VectorizedTree](Value *&LHS, Value *&RHS, - Instruction *RedOp1, Instruction *RedOp2) { - if (!isBoolLogicOp(RedOp1)) - return; - if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS || - isGuaranteedNotToBePoison(LHS)) - return; - if (!isBoolLogicOp(RedOp2)) - return; - if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS || - isGuaranteedNotToBePoison(RHS)) { - std::swap(LHS, RHS); - return; - } - LHS = Builder.CreateFreeze(LHS); - }; + auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, + Instruction *RedOp1, + Instruction *RedOp2, + bool InitStep) { + if (!AnyBoolLogicOp) + return; + if (isBoolLogicOp(RedOp1) && + ((!InitStep && LHS == VectorizedTree) || + getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) + return; + if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || + getRdxOperand(RedOp2, 0) == RHS || + isGuaranteedNotToBePoison(RHS))) { + std::swap(LHS, RHS); + return; + } + if (LHS != VectorizedTree) + LHS = Builder.CreateFreeze(LHS); + }; // Finish the reduction. // Need to add extra arguments and not vectorized possible reduction // values. // Try to avoid dependencies between the scalar remainders after // reductions. 
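The cast of a demoted reduction result back to the original type has to choose sign- versus zero-extension from the reduced scalars themselves, which is what the any_of/computeKnownBits check above does. The rule in isolation (helper name invented for the sketch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"

  using namespace llvm;

  // Sign-extend if any reduced scalar may be negative; otherwise zero-extend.
  static bool needsSignExtension(ArrayRef<Value *> ReducedVals,
                                 const DataLayout &DL) {
    return any_of(ReducedVals, [&](Value *V) {
      KnownBits Known = computeKnownBits(V, DL);
      return !Known.isNonNegative();   // not provably >= 0 -> assume signed
    });
  }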
- auto &&FinalGen = - [this, &Builder, &TrackedVals, &FixBoolLogicalOps]( - ArrayRef<std::pair<Instruction *, Value *>> InstVals) { + auto FinalGen = + [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, + bool InitStep) { unsigned Sz = InstVals.size(); SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2); @@ -13761,7 +14969,7 @@ public: // sequential, safe, scalar boolean logic operations, the // reduction operand must be frozen. FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, - RedOp); + RedOp, InitStep); Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, StableRdxVal2, "op.rdx", ReductionOps); ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); @@ -13791,11 +14999,13 @@ public: ExtraReductions.emplace_back(I, Pair.first); } // Iterate through all not-vectorized reduction values/extra arguments. + bool InitStep = true; while (ExtraReductions.size() > 1) { VectorizedTree = ExtraReductions.front().second; SmallVector<std::pair<Instruction *, Value *>> NewReds = - FinalGen(ExtraReductions); + FinalGen(ExtraReductions, InitStep); ExtraReductions.swap(NewReds); + InitStep = false; } VectorizedTree = ExtraReductions.front().second; @@ -13842,8 +15052,7 @@ private: bool IsCmpSelMinMax, unsigned ReduxWidth, FastMathFlags FMF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - Value *FirstReducedVal = ReducedVals.front(); - Type *ScalarTy = FirstReducedVal->getType(); + Type *ScalarTy = ReducedVals.front()->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost = 0, ScalarCost; // If all of the reduced values are constant, the vector cost is 0, since @@ -13917,7 +15126,7 @@ private: } LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost - << " for reduction that starts with " << *FirstReducedVal + << " for reduction of " << shortBundleName(ReducedVals) << " (It is a splitting reduction)\n"); return VectorCost - ScalarCost; } @@ -13932,7 +15141,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); + return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); } /// Emits optimized code for unique scalar value reused \p Cnt times. 
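FinalGen combines the remaining scalar reduction values pairwise, halving the worklist each round instead of folding one long sequential chain; this keeps the extra ops shallow and lets the bool-logic fixup inspect both operands of every newly created op. A simplified, LLVM-free model of that shape (assumes a non-empty input):

  #include <functional>
  #include <utility>
  #include <vector>

  static int reduceRemainder(std::vector<int> Vals,
                             const std::function<int(int, int)> &Op) {
    while (Vals.size() > 1) {
      std::vector<int> Next((Vals.size() + 1) / 2);
      for (size_t I = 0, E = Vals.size(); I + 1 < E; I += 2)
        Next[I / 2] = Op(Vals[I], Vals[I + 1]);
      if (Vals.size() % 2 != 0)
        Next.back() = Vals.back();   // odd element is carried to the next round
      Vals = std::move(Next);
    }
    return Vals.front();
  }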
@@ -13979,8 +15188,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for repeated scalar."); } @@ -14068,8 +15277,8 @@ private: case RecurKind::Mul: case RecurKind::FMul: case RecurKind::FMulAdd: - case RecurKind::SelectICmp: - case RecurKind::SelectFCmp: + case RecurKind::IAnyOf: + case RecurKind::FAnyOf: case RecurKind::None: llvm_unreachable("Unexpected reduction kind for reused scalars."); } @@ -14164,8 +15373,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst, InsertElts.resize(*AggregateSize); findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); - llvm::erase_value(BuildVectorOpds, nullptr); - llvm::erase_value(InsertElts, nullptr); + llvm::erase(BuildVectorOpds, nullptr); + llvm::erase(InsertElts, nullptr); if (BuildVectorOpds.size() >= 2) return true; @@ -14401,8 +15610,7 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (!R.canMapToVector(IVI->getType(), DL)) + if (!R.canMapToVector(IVI->getType())) return false; SmallVector<Value *, 16> BuildVectorOpds; @@ -14541,11 +15749,11 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, if (BasePred1 > BasePred2) return false; // Compare operands. - bool LEPreds = Pred1 <= Pred2; - bool GEPreds = Pred1 >= Pred2; + bool CI1Preds = Pred1 == BasePred1; + bool CI2Preds = Pred2 == BasePred1; for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { - auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); - auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1); if (Op1->getValueID() < Op2->getValueID()) return !IsCompatibility; if (Op1->getValueID() > Op2->getValueID()) @@ -14691,14 +15899,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; - std::optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. 
if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = - !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); - continue; + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I])) + return false; + if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I])) + continue; + return isa<UndefValue>(Opcodes2[I]); } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { @@ -14714,21 +15928,26 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (S.getOpcode() && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { - if (!ConstOrder) - ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); - continue; - } + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); + if (isa<Instruction>(Opcodes1[I])) + return true; + if (isa<Instruction>(Opcodes2[I])) + return false; + if (isa<Constant>(Opcodes1[I])) + return true; + if (isa<Constant>(Opcodes2[I])) + return false; if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return ConstOrder && *ConstOrder; + return false; }; auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) { if (V1 == V2) @@ -14776,6 +15995,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Incoming.push_back(P); } + if (Incoming.size() <= 1) + break; + // Find the corresponding non-phi nodes for better matching when trying to // build the tree. for (Value *V : Incoming) { @@ -14838,41 +16060,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return I->use_empty() && (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I)); }; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) { // Skip instructions with scalable type. The num of elements is unknown at // compile-time for scalable type. - if (isa<ScalableVectorType>(it->getType())) + if (isa<ScalableVectorType>(It->getType())) continue; // Skip instructions marked for the deletion. - if (R.isDeleted(&*it)) + if (R.isDeleted(&*It)) continue; // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(&*it).second) { - if (HasNoUsers(&*it) && - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator())) { + if (!VisitedInstrs.insert(&*It).second) { + if (HasNoUsers(&*It) && + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); } continue; } - if (isa<DbgInfoIntrinsic>(it)) + if (isa<DbgInfoIntrinsic>(It)) continue; // Try to vectorize reductions that use PHINodes. 
- if (PHINode *P = dyn_cast<PHINode>(it)) { + if (PHINode *P = dyn_cast<PHINode>(It)) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. Instruction *Root = getReductionInstr(DT, P, BB, LI); if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } @@ -14897,23 +16119,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - if (HasNoUsers(&*it)) { + if (HasNoUsers(&*It)) { bool OpsChanged = false; - auto *SI = dyn_cast<StoreInst>(it); + auto *SI = dyn_cast<StoreInst>(It); bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; if (SI) { - auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); + auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); // Try to vectorize chain in store, if this is the only store to the // address in the block. // TODO: This is just a temporarily solution to save compile time. Need // to investigate if we can safely turn on slp-vectorize-hor-store // instead to allow lookup for reduction chains in all non-vectorized // stores (need to check side effects and compile time). - TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) && - SI->getValueOperand()->hasOneUse(); + TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) && + SI->getValueOperand()->hasOneUse(); } if (TryToVectorizeRoot) { - for (auto *V : it->operand_values()) { + for (auto *V : It->operand_values()) { // Postponed instructions should not be vectorized here, delay their // vectorization. if (auto *VI = dyn_cast<Instruction>(V); @@ -14926,21 +16148,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // top-tree instructions to try to vectorize as many instructions as // possible. OpsChanged |= - VectorizeInsertsAndCmps(/*VectorizeCmps=*/it->isTerminator()); + VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator()); if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; - it = BB->begin(); - e = BB->end(); + It = BB->begin(); + E = BB->end(); continue; } } - if (isa<InsertElementInst, InsertValueInst>(it)) - PostProcessInserts.insert(&*it); - else if (isa<CmpInst>(it)) - PostProcessCmps.insert(cast<CmpInst>(&*it)); + if (isa<InsertElementInst, InsertValueInst>(It)) + PostProcessInserts.insert(&*It); + else if (isa<CmpInst>(It)) + PostProcessCmps.insert(cast<CmpInst>(&*It)); } return Changed; @@ -15044,6 +16266,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { // compatible (have the same opcode, same parent), otherwise it is // definitely not profitable to try to vectorize them. 
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { + if (V->getValueOperand()->getType()->getTypeID() < + V2->getValueOperand()->getType()->getTypeID()) + return true; + if (V->getValueOperand()->getType()->getTypeID() > + V2->getValueOperand()->getType()->getTypeID()) + return false; if (V->getPointerOperandType()->getTypeID() < V2->getPointerOperandType()->getTypeID()) return true; @@ -15082,6 +16310,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { if (V1 == V2) return true; + if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) + return false; if (V1->getPointerOperandType() != V2->getPointerOperandType()) return false; // Undefs are compatible with any other value. @@ -15113,8 +16343,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) continue; + // Reverse stores to do bottom-to-top analysis. This is important if the + // values are stores to the same addresses several times, in this case need + // to follow the stores order (reversed to meet the memory dependecies). + SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(), + Pair.second.rend()); Changed |= tryToVectorizeSequence<StoreInst>( - Pair.second, StoreSorter, AreCompatibleStores, + ReversedStores, StoreSorter, AreCompatibleStores, [this, &R](ArrayRef<StoreInst *> Candidates, bool) { return vectorizeStores(Candidates, R); }, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 1271d1424c03..7ff6749a0908 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -133,9 +133,12 @@ public: Ingredient2Recipe[I] = R; } + /// Create the mask for the vector loop header block. + void createHeaderMask(VPlan &Plan); + /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. + /// that the header block of the loop is set to True or the loop mask when + /// tail folding. It returns the *entry* mask for the block BB. VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); /// A helper function that computes the predicate of the edge between SRC diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index e81b88fd8099..1d7df9c9575a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -19,7 +19,6 @@ #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -234,6 +233,99 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { // set(Def, Extract, Instance); return Extract; } + +Value *VPTransformState::get(VPValue *Def, unsigned Part) { + // If Values have been set for this Def return the one relevant for \p Part. 
+ if (hasVectorValue(Def, Part)) + return Data.PerPartOutput[Def][Part]; + + auto GetBroadcastInstrs = [this, Def](Value *V) { + bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); + if (VF.isScalar()) + return V; + // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); + if (SafeToHoist) { + BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( + Plan->getVectorLoopRegion()->getSinglePredecessor())]; + if (LoopVectorPreHeader) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + } + + // Place the code for broadcasting invariant variables in the new preheader. + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + return Shuf; + }; + + if (!hasScalarValue(Def, {Part, 0})) { + assert(Def->isLiveIn() && "expected a live-in"); + if (Part != 0) + return get(Def, 0); + Value *IRV = Def->getLiveInIRValue(); + Value *B = GetBroadcastInstrs(IRV); + set(Def, B, Part); + return B; + } + + Value *ScalarValue = get(Def, {Part, 0}); + // If we aren't vectorizing, we can just copy the scalar map values over + // to the vector map. + if (VF.isScalar()) { + set(Def, ScalarValue, Part); + return ScalarValue; + } + + bool IsUniform = vputils::isUniformAfterVectorization(Def); + + unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; + // Check if there is a scalar value for the selected lane. + if (!hasScalarValue(Def, {Part, LastLane})) { + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and + // VPExpandSCEVRecipes can also be uniform. + assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || + isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || + isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && + "unexpected recipe found to be invariant"); + IsUniform = true; + LastLane = 0; + } + + auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); + // Set the insert point after the last scalarized instruction or after the + // last PHI, if LastInst is a PHI. This ensures the insertelement sequence + // will directly follow the scalar definitions. + auto OldIP = Builder.saveIP(); + auto NewIP = + isa<PHINode>(LastInst) + ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) + : std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using + // insertelement instructions. Since the resulting vectors are stored in + // State, we will only generate the insertelements once. + Value *VectorValue = nullptr; + if (IsUniform) { + VectorValue = GetBroadcastInstrs(ScalarValue); + set(Def, VectorValue, Part); + } else { + // Initialize packing with insertelements to start from undef. 
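VPTransformState::get(Def, Part) above either splats a uniform scalar or packs per-lane scalars into a vector one insertelement at a time, starting from poison. The two paths, reduced to a self-contained sketch for fixed vectorization factors (LaneValues is a stand-in for the per-lane scalars held in State, not a real field):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/TypeSize.h"

  using namespace llvm;

  static Value *materializeVector(IRBuilder<> &B, ArrayRef<Value *> LaneValues,
                                  bool IsUniform, ElementCount VF) {
    if (IsUniform)
      return B.CreateVectorSplat(VF, LaneValues[0], "broadcast");
    // Non-uniform: insert each lane's scalar into an initially-poison vector.
    Value *Vec = PoisonValue::get(VectorType::get(LaneValues[0]->getType(), VF));
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      Vec = B.CreateInsertElement(Vec, LaneValues[Lane], Lane);
    return Vec;
  }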
+ assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + set(Def, Undef, Part); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + packScalarIntoVectorValue(Def, {Part, Lane}); + VectorValue = get(Def, Part); + } + Builder.restoreIP(OldIP); + return VectorValue; +} + BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; @@ -267,18 +359,15 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { } } -void VPTransformState::setDebugLocFromInst(const Value *V) { - const Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst) { - Builder.SetCurrentDebugLocation(DebugLoc()); - return; - } - - const DILocation *DIL = Inst->getDebugLoc(); +void VPTransformState::setDebugLocFrom(DebugLoc DL) { + const DILocation *DIL = DL; // When a FSDiscriminator is enabled, we don't need to add the multiply // factors to the discriminators. - if (DIL && Inst->getFunction()->shouldEmitDebugInfoForProfiling() && - !Inst->isDebugOrPseudoInst() && !EnableFSDiscriminator) { + if (DIL && + Builder.GetInsertBlock() + ->getParent() + ->shouldEmitDebugInfoForProfiling() && + !EnableFSDiscriminator) { // FIXME: For scalable vectors, assume vscale=1. auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); @@ -291,6 +380,15 @@ void VPTransformState::setDebugLocFromInst(const Value *V) { Builder.SetCurrentDebugLocation(DIL); } +void VPTransformState::packScalarIntoVectorValue(VPValue *Def, + const VPIteration &Instance) { + Value *ScalarInst = get(Def, Instance); + Value *VectorValue = get(Def, Instance.Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF)); + set(Def, VectorValue, Instance.Part); +} + BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. @@ -616,22 +714,17 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) { auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader); Plan->TripCount = vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE); + // Create empty VPRegionBlock, to be filled during processing later. + auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/); + VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); + VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); return Plan; } -VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - if (isa<VPActiveLaneMaskPHIRecipe>(&R)) - return cast<VPActiveLaneMaskPHIRecipe>(&R); - } - return nullptr; -} - void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, - VPTransformState &State, - bool IsEpilogueVectorization) { + VPTransformState &State) { // Check if the backedge taken count is needed, and if so build it. 
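The new VFxUF live-in is materialized with createStepForVF and models the number of scalar iterations retired per vector iteration. As a hedged sketch of how such a step value is typically built, a constant for fixed VF and a vscale-scaled product otherwise (an illustration only, not the helper's actual body):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/TypeSize.h"

  using namespace llvm;

  static Value *createStep(IRBuilder<> &B, Type *Ty, ElementCount VF,
                           unsigned UF) {
    if (!VF.isScalable())
      return ConstantInt::get(Ty, (uint64_t)VF.getFixedValue() * UF);
    // Scalable VF: step = vscale * KnownMinVF * UF.
    Value *VScaled = B.CreateVScale(ConstantInt::get(Ty, VF.getKnownMinValue()));
    return B.CreateMul(VScaled, ConstantInt::get(Ty, UF));
  }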
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); @@ -648,6 +741,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(&VectorTripCount, VectorTripCountV, Part); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + // FIXME: Model VF * UF computation completely in VPlan. + State.set(&VFxUF, + createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF), + 0); + // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. // FIXME: Improve modeling for canonical IV start values in the epilogue loop. @@ -656,16 +755,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { - if (isa<VPScalarIVStepsRecipe>(U) || - isa<VPDerivedIVRecipe>(U)) - return true; - auto *VPI = cast<VPInstruction>(U); - return VPI->getOpcode() == - VPInstruction::CanonicalIVIncrement || - VPI->getOpcode() == - VPInstruction::CanonicalIVIncrementNUW; + return isa<VPScalarIVStepsRecipe>(U) || + isa<VPDerivedIVRecipe>(U) || + cast<VPInstruction>(U)->getOpcode() == + Instruction::Add; }) && - "the canonical IV should only be used by its increments or " + "the canonical IV should only be used by its increment or " "ScalarIVSteps when resetting the start value"); IV->setOperand(0, VPV); } @@ -754,11 +849,14 @@ void VPlan::execute(VPTransformState *State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD -void VPlan::print(raw_ostream &O) const { +void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); - O << "VPlan '" << getName() << "' {"; + if (VFxUF.getNumUsers() > 0) { + O << "\nLive-in "; + VFxUF.printAsOperand(O, SlotTracker); + O << " = VF * UF"; + } if (VectorTripCount.getNumUsers() > 0) { O << "\nLive-in "; @@ -778,6 +876,15 @@ void VPlan::print(raw_ostream &O) const { TripCount->printAsOperand(O, SlotTracker); O << " = original trip-count"; O << "\n"; +} + +LLVM_DUMP_METHOD +void VPlan::print(raw_ostream &O) const { + VPSlotTracker SlotTracker(this); + + O << "VPlan '" << getName() << "' {"; + + printLiveIns(O); if (!getPreheader()->empty()) { O << "\n"; @@ -895,11 +1002,18 @@ void VPlanPrinter::dump() { OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; if (!Plan.getName().empty()) OS << "\\n" << DOT::EscapeString(Plan.getName()); - if (Plan.BackedgeTakenCount) { - OS << ", where:\\n"; - Plan.BackedgeTakenCount->print(OS, SlotTracker); - OS << " := BackedgeTakenCount"; + + { + // Print live-ins. 
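The new VPValue::replaceUsesWithIf mirrors the long-standing IR-level Value API of the same name, adding the operand index to the callback and an early-out when New is the value itself. For comparison, the IR-level form applied to a common case, rewriting only the uses that sit inside PHI nodes:

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;

  // Replace only those uses of Old whose user is a PHI node.
  static void replacePhiUsesOnly(Value *Old, Value *New) {
    Old->replaceUsesWithIf(New, [](Use &U) {
      return isa<PHINode>(U.getUser());
    });
  }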
+ std::string Str; + raw_string_ostream SS(Str); + Plan.printLiveIns(SS); + SmallVector<StringRef, 0> Lines; + StringRef(Str).rtrim('\n').split(Lines, "\n"); + for (auto Line : Lines) + OS << DOT::EscapeString(Line.str()) << "\\n"; } + OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; OS << "edge [fontname=Courier, fontsize=30]\n"; @@ -1021,16 +1135,43 @@ void VPlanIngredient::print(raw_ostream &O) const { template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { + if (this == New) + return; for (unsigned J = 0; J < getNumUsers();) { VPUser *User = Users[J]; - unsigned NumUsers = getNumUsers(); + bool RemovedUser = false; for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) - if (User->getOperand(I) == this) + if (User->getOperand(I) == this) { User->setOperand(I, New); + RemovedUser = true; + } // If a user got removed after updating the current user, the next user to // update will be moved to the current position, so we only need to // increment the index if the number of users did not change. - if (NumUsers == getNumUsers()) + if (!RemovedUser) + J++; + } +} + +void VPValue::replaceUsesWithIf( + VPValue *New, + llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) { + if (this == New) + return; + for (unsigned J = 0; J < getNumUsers();) { + VPUser *User = Users[J]; + bool RemovedUser = false; + for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) { + if (User->getOperand(I) != this || !ShouldReplace(*User, I)) + continue; + + RemovedUser = true; + User->setOperand(I, New); + } + // If a user got removed after updating the current user, the next user to + // update will be moved to the current position, so we only need to + // increment the index if the number of users did not change. 
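The replaceUsesWithIf being added in this hunk mirrors replaceAllUsesWith but lets the caller decide, per operand use, whether to rewrite it. A minimal usage sketch, assuming placeholder names OldV, NewV and TargetRecipe that are not part of the patch:

  // Illustrative only: rewrite uses of OldV to NewV, but only in operands of
  // TargetRecipe; every other user keeps referencing OldV.
  OldV->replaceUsesWithIf(NewV, [&](llvm::VPUser &U, unsigned /*OpIdx*/) {
    return &U == TargetRecipe;
  });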
+ if (!RemovedUser) J++; } } @@ -1116,6 +1257,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) { } void VPSlotTracker::assignSlots(const VPlan &Plan) { + if (Plan.VFxUF.getNumUsers() > 0) + assignSlot(&Plan.VFxUF); assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); @@ -1139,6 +1282,11 @@ bool vputils::onlyFirstLaneUsed(VPValue *Def) { [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); } +bool vputils::onlyFirstPartUsed(VPValue *Def) { + return all_of(Def->users(), + [Def](VPUser *U) { return U->onlyFirstPartUsed(Def); }); +} + VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 73313465adea..94cb76889813 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -23,6 +23,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#include "VPlanAnalysis.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" @@ -233,9 +234,9 @@ struct VPIteration { struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan) + InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx) : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr) {} + LVer(nullptr), TypeAnalysis(Ctx) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -274,10 +275,6 @@ struct VPTransformState { I->second[Part]; } - bool hasAnyVectorValue(VPValue *Def) const { - return Data.PerPartOutput.contains(Def); - } - bool hasScalarValue(VPValue *Def, VPIteration Instance) { auto I = Data.PerPartScalars.find(Def); if (I == Data.PerPartScalars.end()) @@ -349,8 +346,11 @@ struct VPTransformState { /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// Set the debug location in the builder using the debug location in \p V. - void setDebugLocFromInst(const Value *V); + /// Set the debug location in the builder using the debug location \p DL. + void setDebugLocFrom(DebugLoc DL); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. @@ -410,6 +410,9 @@ struct VPTransformState { /// Map SCEVs to their expanded values. Populated when executing /// VPExpandSCEVRecipes. DenseMap<const SCEV *, Value *> ExpandedSCEVs; + + /// VPlan-based type analysis. + VPTypeAnalysis TypeAnalysis; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -582,6 +585,8 @@ public: /// This VPBlockBase must have no successors. void setOneSuccessor(VPBlockBase *Successor) { assert(Successors.empty() && "Setting one successor when others exist."); + assert(Successor->getParent() == getParent() && + "connected blocks must have the same parent"); appendSuccessor(Successor); } @@ -693,7 +698,7 @@ public: }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR -/// instructions. 
VPRecipeBase owns the the VPValues it defines through VPDef +/// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value /// VPRecipeBases that also inherit from VPValue must make sure to inherit from /// VPRecipeBase before VPValue. @@ -706,13 +711,18 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, /// Each VPRecipe belongs to a single VPBasicBlock. VPBasicBlock *Parent = nullptr; + /// The debug location for the recipe. + DebugLoc DL; + public: - VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} + VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, + DebugLoc DL = {}) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} template <typename IterT> - VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} + VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands, + DebugLoc DL = {}) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} virtual ~VPRecipeBase() = default; /// \return the VPBasicBlock which this VPRecipe belongs to. @@ -789,6 +799,9 @@ public: bool mayReadOrWriteMemory() const { return mayReadFromMemory() || mayWriteToMemory(); } + + /// Returns the debug location of the recipe. + DebugLoc getDebugLoc() const { return DL; } }; // Helper macro to define common classof implementations for recipes. @@ -808,153 +821,30 @@ public: return R->getVPDefID() == VPDefID; \ } -/// This is a concrete Recipe that models a single VPlan-level instruction. -/// While as any Recipe it may generate a sequence of IR instructions when -/// executed, these instructions would always form a single-def expression as -/// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeBase, public VPValue { - friend class VPlanSlp; - -public: - /// VPlan opcodes, extending LLVM IR with idiomatics instructions. - enum { - FirstOrderRecurrenceSplice = - Instruction::OtherOpsEnd + 1, // Combines the incoming and previous - // values of a first-order recurrence. - Not, - ICmpULE, - SLPLoad, - SLPStore, - ActiveLaneMask, - CalculateTripCountMinusVF, - CanonicalIVIncrement, - CanonicalIVIncrementNUW, - // The next two are similar to the above, but instead increment the - // canonical IV separately for each unrolled part. - CanonicalIVIncrementForPart, - CanonicalIVIncrementForPartNUW, - BranchOnCount, - BranchOnCond - }; - -private: - typedef unsigned char OpcodeTy; - OpcodeTy Opcode; - FastMathFlags FMF; - DebugLoc DL; - - /// An optional name that can be used for the generated IR instruction. - const std::string Name; - - /// Utility method serving execute(): generates a single instance of the - /// modeled instruction. \returns the generated value for \p Part. - /// In some cases an existing value is returned rather than a generated - /// one. 
- Value *generateInstruction(VPTransformState &State, unsigned Part); - -protected: - void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } - -public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, - const Twine &Name = "") - : VPRecipeBase(VPDef::VPInstructionSC, Operands), VPValue(this), - Opcode(Opcode), DL(DL), Name(Name.str()) {} - - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}, const Twine &Name = "") - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} - - VP_CLASSOF_IMPL(VPDef::VPInstructionSC) - - VPInstruction *clone() const { - SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL, Name); - } - - unsigned getOpcode() const { return Opcode; } - - /// Generate the instruction. - /// TODO: We currently execute only per-part unless a specific instance is - /// provided. - void execute(VPTransformState &State) override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPInstruction to \p O. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - - /// Print the VPInstruction to dbgs() (for debugging). - LLVM_DUMP_METHOD void dump() const; -#endif - - /// Return true if this instruction may modify memory. - bool mayWriteToMemory() const { - // TODO: we can use attributes of the called function to rule out memory - // modifications. - return Opcode == Instruction::Store || Opcode == Instruction::Call || - Opcode == Instruction::Invoke || Opcode == SLPStore; - } - - bool hasResult() const { - // CallInst may or may not have a result, depending on the called function. - // Conservatively return calls have results for now. - switch (getOpcode()) { - case Instruction::Ret: - case Instruction::Br: - case Instruction::Store: - case Instruction::Switch: - case Instruction::IndirectBr: - case Instruction::Resume: - case Instruction::CatchRet: - case Instruction::Unreachable: - case Instruction::Fence: - case Instruction::AtomicRMW: - case VPInstruction::BranchOnCond: - case VPInstruction::BranchOnCount: - return false; - default: - return true; - } - } - - /// Set the fast-math flags. - void setFastMathFlags(FastMathFlags FMFNew); - - /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - if (getOperand(0) != Op) - return false; - switch (getOpcode()) { - default: - return false; - case VPInstruction::ActiveLaneMask: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::CanonicalIVIncrementForPartNUW: - case VPInstruction::BranchOnCount: - return true; - }; - llvm_unreachable("switch should return"); - } -}; - /// Class to record LLVM IR flag for a recipe along with it. 
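The VPRecipeWithIRFlags class that follows records wrapping, exactness, inbounds, non-negative, disjoint and fast-math flags from the original IR instruction so they can be re-applied to the instructions the recipe generates. A free-standing sketch of that capture-and-reapply pattern, using only the LLVM operator classes the patch itself queries; copyIRFlags is an illustrative helper, not part of the patch, and it assumes New has the same opcode as Orig:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Operator.h"

  // Illustrative only: copy the flag kinds handled by VPRecipeWithIRFlags
  // from an original instruction onto a newly generated one.
  static void copyIRFlags(const llvm::Instruction &Orig, llvm::Instruction &New) {
    using namespace llvm;
    if (auto *OB = dyn_cast<OverflowingBinaryOperator>(&Orig)) {
      New.setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
      New.setHasNoSignedWrap(OB->hasNoSignedWrap());
    } else if (auto *EO = dyn_cast<PossiblyExactOperator>(&Orig)) {
      New.setIsExact(EO->isExact());
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&Orig)) {
      // Assumes New is also a GEP of the same shape.
      cast<GetElementPtrInst>(&New)->setIsInBounds(GEP->isInBounds());
    } else if (auto *FP = dyn_cast<FPMathOperator>(&Orig)) {
      New.setFastMathFlags(FP->getFastMathFlags());
    }
  }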
class VPRecipeWithIRFlags : public VPRecipeBase { enum class OperationType : unsigned char { + Cmp, OverflowingBinOp, + DisjointOp, PossiblyExactOp, GEPOp, FPMathOp, + NonNegOp, Other }; + +public: struct WrapFlagsTy { char HasNUW : 1; char HasNSW : 1; + + WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {} + }; + +private: + struct DisjointFlagsTy { + char IsDisjoint : 1; }; struct ExactFlagsTy { char IsExact : 1; @@ -962,6 +852,9 @@ class VPRecipeWithIRFlags : public VPRecipeBase { struct GEPFlagsTy { char IsInBounds : 1; }; + struct NonNegFlagsTy { + char NonNeg : 1; + }; struct FastMathFlagsTy { char AllowReassoc : 1; char NoNaNs : 1; @@ -970,56 +863,81 @@ class VPRecipeWithIRFlags : public VPRecipeBase { char AllowReciprocal : 1; char AllowContract : 1; char ApproxFunc : 1; + + FastMathFlagsTy(const FastMathFlags &FMF); }; OperationType OpType; union { + CmpInst::Predicate CmpPredicate; WrapFlagsTy WrapFlags; + DisjointFlagsTy DisjointFlags; ExactFlagsTy ExactFlags; GEPFlagsTy GEPFlags; + NonNegFlagsTy NonNegFlags; FastMathFlagsTy FMFs; - unsigned char AllFlags; + unsigned AllFlags; }; public: template <typename IterT> - VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands) - : VPRecipeBase(SC, Operands) { + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL) { OpType = OperationType::Other; AllFlags = 0; } template <typename IterT> - VPRecipeWithIRFlags(const unsigned char SC, iterator_range<IterT> Operands, - Instruction &I) - : VPRecipeWithIRFlags(SC, Operands) { - if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) { + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I) + : VPRecipeWithIRFlags(SC, Operands, I.getDebugLoc()) { + if (auto *Op = dyn_cast<CmpInst>(&I)) { + OpType = OperationType::Cmp; + CmpPredicate = Op->getPredicate(); + } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) { + OpType = OperationType::DisjointOp; + DisjointFlags.IsDisjoint = Op->isDisjoint(); + } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) { OpType = OperationType::OverflowingBinOp; - WrapFlags.HasNUW = Op->hasNoUnsignedWrap(); - WrapFlags.HasNSW = Op->hasNoSignedWrap(); + WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()}; } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) { OpType = OperationType::PossiblyExactOp; ExactFlags.IsExact = Op->isExact(); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { OpType = OperationType::GEPOp; GEPFlags.IsInBounds = GEP->isInBounds(); + } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) { + OpType = OperationType::NonNegOp; + NonNegFlags.NonNeg = PNNI->hasNonNeg(); } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) { OpType = OperationType::FPMathOp; - FastMathFlags FMF = Op->getFastMathFlags(); - FMFs.AllowReassoc = FMF.allowReassoc(); - FMFs.NoNaNs = FMF.noNaNs(); - FMFs.NoInfs = FMF.noInfs(); - FMFs.NoSignedZeros = FMF.noSignedZeros(); - FMFs.AllowReciprocal = FMF.allowReciprocal(); - FMFs.AllowContract = FMF.allowContract(); - FMFs.ApproxFunc = FMF.approxFunc(); + FMFs = Op->getFastMathFlags(); } } + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + CmpInst::Predicate Pred, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), OpType(OperationType::Cmp), + CmpPredicate(Pred) {} + + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + WrapFlagsTy WrapFlags, DebugLoc DL = {}) + : VPRecipeBase(SC, 
Operands, DL), OpType(OperationType::OverflowingBinOp), + WrapFlags(WrapFlags) {} + + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + FastMathFlags FMFs, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp), + FMFs(FMFs) {} + static inline bool classof(const VPRecipeBase *R) { - return R->getVPDefID() == VPRecipeBase::VPWidenSC || + return R->getVPDefID() == VPRecipeBase::VPInstructionSC || + R->getVPDefID() == VPRecipeBase::VPWidenSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || + R->getVPDefID() == VPRecipeBase::VPWidenCastSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC; } @@ -1032,6 +950,9 @@ public: WrapFlags.HasNUW = false; WrapFlags.HasNSW = false; break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint = false; + break; case OperationType::PossiblyExactOp: ExactFlags.IsExact = false; break; @@ -1042,6 +963,10 @@ public: FMFs.NoNaNs = false; FMFs.NoInfs = false; break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg = false; + break; + case OperationType::Cmp: case OperationType::Other: break; } @@ -1054,6 +979,9 @@ public: I->setHasNoUnsignedWrap(WrapFlags.HasNUW); I->setHasNoSignedWrap(WrapFlags.HasNSW); break; + case OperationType::DisjointOp: + cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint); + break; case OperationType::PossiblyExactOp: I->setIsExact(ExactFlags.IsExact); break; @@ -1069,43 +997,209 @@ public: I->setHasAllowContract(FMFs.AllowContract); I->setHasApproxFunc(FMFs.ApproxFunc); break; + case OperationType::NonNegOp: + I->setNonNeg(NonNegFlags.NonNeg); + break; + case OperationType::Cmp: case OperationType::Other: break; } } + CmpInst::Predicate getPredicate() const { + assert(OpType == OperationType::Cmp && + "recipe doesn't have a compare predicate"); + return CmpPredicate; + } + bool isInBounds() const { assert(OpType == OperationType::GEPOp && "recipe doesn't have inbounds flag"); return GEPFlags.IsInBounds; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - FastMathFlags getFastMathFlags() const { - FastMathFlags Res; - Res.setAllowReassoc(FMFs.AllowReassoc); - Res.setNoNaNs(FMFs.NoNaNs); - Res.setNoInfs(FMFs.NoInfs); - Res.setNoSignedZeros(FMFs.NoSignedZeros); - Res.setAllowReciprocal(FMFs.AllowReciprocal); - Res.setAllowContract(FMFs.AllowContract); - Res.setApproxFunc(FMFs.ApproxFunc); - return Res; + /// Returns true if the recipe has fast-math flags. + bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; } + + FastMathFlags getFastMathFlags() const; + + bool hasNoUnsignedWrap() const { + assert(OpType == OperationType::OverflowingBinOp && + "recipe doesn't have a NUW flag"); + return WrapFlags.HasNUW; } + bool hasNoSignedWrap() const { + assert(OpType == OperationType::OverflowingBinOp && + "recipe doesn't have a NSW flag"); + return WrapFlags.HasNSW; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printFlags(raw_ostream &O) const; #endif }; +/// This is a concrete Recipe that models a single VPlan-level instruction. +/// While as any Recipe it may generate a sequence of IR instructions when +/// executed, these instructions would always form a single-def expression as +/// the VPInstruction is also a single def-use vertex. +class VPInstruction : public VPRecipeWithIRFlags, public VPValue { + friend class VPlanSlp; + +public: + /// VPlan opcodes, extending LLVM IR with idiomatics instructions. 
+ enum { + FirstOrderRecurrenceSplice = + Instruction::OtherOpsEnd + 1, // Combines the incoming and previous + // values of a first-order recurrence. + Not, + SLPLoad, + SLPStore, + ActiveLaneMask, + CalculateTripCountMinusVF, + // Increment the canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + BranchOnCount, + BranchOnCond + }; + +private: + typedef unsigned char OpcodeTy; + OpcodeTy Opcode; + + /// An optional name that can be used for the generated IR instruction. + const std::string Name; + + /// Utility method serving execute(): generates a single instance of the + /// modeled instruction. \returns the generated value for \p Part. + /// In some cases an existing value is returned rather than a generated + /// one. + Value *generateInstruction(VPTransformState &State, unsigned Part); + +#if !defined(NDEBUG) + /// Return true if the VPInstruction is a floating point math operation, i.e. + /// has fast-math flags. + bool isFPMathOp() const; +#endif + +protected: + void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } + +public: + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) {} + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} + + VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A, + VPValue *B, DebugLoc DL = {}, const Twine &Name = ""); + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) {} + + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = ""); + + VP_CLASSOF_IMPL(VPDef::VPInstructionSC) + + unsigned getOpcode() const { return Opcode; } + + /// Generate the instruction. + /// TODO: We currently execute only per-part unless a specific instance is + /// provided. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the VPInstruction to \p O. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + + /// Print the VPInstruction to dbgs() (for debugging). + LLVM_DUMP_METHOD void dump() const; +#endif + + /// Return true if this instruction may modify memory. + bool mayWriteToMemory() const { + // TODO: we can use attributes of the called function to rule out memory + // modifications. + return Opcode == Instruction::Store || Opcode == Instruction::Call || + Opcode == Instruction::Invoke || Opcode == SLPStore; + } + + bool hasResult() const { + // CallInst may or may not have a result, depending on the called function. + // Conservatively return calls have results for now. 
+ switch (getOpcode()) { + case Instruction::Ret: + case Instruction::Br: + case Instruction::Store: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Resume: + case Instruction::CatchRet: + case Instruction::Unreachable: + case Instruction::Fence: + case Instruction::AtomicRMW: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: + return false; + default: + return true; + } + } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + if (getOperand(0) != Op) + return false; + switch (getOpcode()) { + default: + return false; + case VPInstruction::ActiveLaneMask: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::BranchOnCount: + return true; + }; + llvm_unreachable("switch should return"); + } + + /// Returns true if the recipe only uses the first part of operand \p Op. + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + if (getOperand(0) != Op) + return false; + switch (getOpcode()) { + default: + return false; + case VPInstruction::BranchOnCount: + return true; + }; + llvm_unreachable("switch should return"); + } +}; + /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { + unsigned Opcode; public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I) {} + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I), + Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1114,6 +1208,8 @@ public: /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; + unsigned getOpcode() const { return Opcode; } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1122,7 +1218,7 @@ public: }; /// VPWidenCastRecipe is a recipe to create vector cast instructions. -class VPWidenCastRecipe : public VPRecipeBase, public VPValue { +class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { /// Cast instruction opcode. 
Instruction::CastOps Opcode; @@ -1131,15 +1227,19 @@ class VPWidenCastRecipe : public VPRecipeBase, public VPValue { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst *UI = nullptr) - : VPRecipeBase(VPDef::VPWidenCastSC, Op), VPValue(this, UI), + CastInst &UI) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPValue(this, &UI), Opcode(Opcode), ResultTy(ResultTy) { - assert((!UI || UI->getOpcode() == Opcode) && + assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); - assert((!UI || UI->getType() == ResultTy) && + assert(UI.getType() == ResultTy && "result type of underlying cast doesn't match"); } + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPValue(this, nullptr), + Opcode(Opcode), ResultTy(ResultTy) {} + ~VPWidenCastRecipe() override = default; VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) @@ -1196,7 +1296,8 @@ public: struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue { template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I) {} + : VPRecipeBase(VPDef::VPWidenSelectSC, Operands, I.getDebugLoc()), + VPValue(this, &I) {} ~VPWidenSelectRecipe() override = default; @@ -1282,8 +1383,8 @@ public: class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, - VPValue *Start = nullptr) - : VPRecipeBase(VPDefID, {}), VPValue(this, UnderlyingInstr) { + VPValue *Start = nullptr, DebugLoc DL = {}) + : VPRecipeBase(VPDefID, {}, DL), VPValue(this, UnderlyingInstr) { if (Start) addOperand(Start); } @@ -1404,7 +1505,7 @@ public: bool isCanonical() const; /// Returns the scalar type of the induction. - const Type *getScalarType() const { + Type *getScalarType() const { return Trunc ? Trunc->getType() : IV->getType(); } }; @@ -1565,14 +1666,13 @@ public: /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPRecipeBase, public VPValue { - PHINode *Phi; - public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPDef::VPBlendSC, Operands), VPValue(this, Phi), Phi(Phi) { + : VPRecipeBase(VPDef::VPBlendSC, Operands, Phi->getDebugLoc()), + VPValue(this, Phi) { assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1701,16 +1801,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. 
- const RecurrenceDescriptor *RdxDesc; - /// Pointer to the TTI, needed to create the target reduction - const TargetTransformInfo *TTI; + const RecurrenceDescriptor &RdxDesc; public: - VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, - VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - const TargetTransformInfo *TTI) + VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp) : VPRecipeBase(VPDef::VPReductionSC, {ChainOp, VecOp}), VPValue(this, I), - RdxDesc(R), TTI(TTI) { + RdxDesc(R) { if (CondOp) addOperand(CondOp); } @@ -2008,11 +2105,9 @@ public: /// loop). VPWidenCanonicalIVRecipe represents the vector version of the /// canonical induction variable. class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { - DebugLoc DL; - public: VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL) - : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV), DL(DL) {} + : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {} ~VPCanonicalIVPHIRecipe() override = default; @@ -2032,8 +2127,8 @@ public: #endif /// Returns the scalar type of the induction. - const Type *getScalarType() const { - return getOperand(0)->getLiveInIRValue()->getType(); + Type *getScalarType() const { + return getStartValue()->getLiveInIRValue()->getType(); } /// Returns true if the recipe only uses the first lane of operand \p Op. @@ -2043,6 +2138,13 @@ public: return true; } + /// Returns true if the recipe only uses the first part of operand \p Op. + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + /// Check if the induction described by \p Kind, /p Start and \p Step is /// canonical, i.e. has the same start, step (of 1), and type as the /// canonical IV. @@ -2055,12 +2157,10 @@ public: /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and /// remove VPActiveLaneMaskPHIRecipe. class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { - DebugLoc DL; - public: VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) - : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask), - DL(DL) {} + : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, + DL) {} ~VPActiveLaneMaskPHIRecipe() override = default; @@ -2113,19 +2213,24 @@ public: /// an IV with different start and step values, using Start + CanonicalIV * /// Step. class VPDerivedIVRecipe : public VPRecipeBase, public VPValue { - /// The type of the result value. It may be smaller than the type of the - /// induction and in this case it will get truncated to ResultTy. - Type *ResultTy; + /// If not nullptr, the result of the induction will get truncated to + /// TruncResultTy. + Type *TruncResultTy; - /// Induction descriptor for the induction the canonical IV is transformed to. - const InductionDescriptor &IndDesc; + /// Kind of the induction. + const InductionDescriptor::InductionKind Kind; + /// If not nullptr, the floating point induction binary operator. Must be set + /// for floating point inductions. 
+ const FPMathOperator *FPBinOp; public: VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start, VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step, - Type *ResultTy) + Type *TruncResultTy) : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}), - VPValue(this), ResultTy(ResultTy), IndDesc(IndDesc) {} + VPValue(this), TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()), + FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) { + } ~VPDerivedIVRecipe() override = default; @@ -2141,6 +2246,11 @@ public: VPSlotTracker &SlotTracker) const override; #endif + Type *getScalarType() const { + return TruncResultTy ? TruncResultTy + : getStartValue()->getLiveInIRValue()->getType(); + } + VPValue *getStartValue() const { return getOperand(0); } VPValue *getCanonicalIV() const { return getOperand(1); } VPValue *getStepValue() const { return getOperand(2); } @@ -2155,14 +2265,23 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their scalar values. -class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue { - const InductionDescriptor &IndDesc; +class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue { + Instruction::BinaryOps InductionOpcode; public: + VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step, + Instruction::BinaryOps Opcode, FastMathFlags FMFs) + : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC, + ArrayRef<VPValue *>({IV, Step}), FMFs), + VPValue(this), InductionOpcode(Opcode) {} + VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, VPValue *Step) - : VPRecipeBase(VPDef::VPScalarIVStepsSC, {IV, Step}), VPValue(this), - IndDesc(IndDesc) {} + : VPScalarIVStepsRecipe( + IV, Step, IndDesc.getInductionOpcode(), + dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()) + ? IndDesc.getInductionBinOp()->getFastMathFlags() + : FastMathFlags()) {} ~VPScalarIVStepsRecipe() override = default; @@ -2445,6 +2564,9 @@ class VPlan { /// Represents the vector trip count. VPValue VectorTripCount; + /// Represents the loop-invariant VF * UF of the vector loop region. + VPValue VFxUF; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2490,15 +2612,17 @@ public: ~VPlan(); - /// Create an initial VPlan with preheader and entry blocks. Creates a - /// VPExpandSCEVRecipe for \p TripCount and uses it as plan's trip count. + /// Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping + /// original scalar pre-header) which contains SCEV expansions that need to + /// happen before the CFG is modified; a VPBasicBlock for the vector + /// pre-header, followed by a region for the vector loop, followed by the + /// middle VPBasicBlock. static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE); /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State, - bool IsEpilogueVectorization); + Value *CanonicalIVStartValue, VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); @@ -2522,6 +2646,9 @@ public: /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } + /// Returns VF * UF of the vector loop region. + VPValue &getVFxUF() { return VFxUF; } + /// Mark the plan to indicate that using Value2VPValue is not safe any /// longer, because it may be stale. 
void disableValue2VPValue() { Value2VPValueEnabled = false; } @@ -2583,13 +2710,10 @@ public: return getVPValue(V); } - void removeVPValueFor(Value *V) { - assert(Value2VPValueEnabled && - "IR value to VPValue mapping may be out of date!"); - Value2VPValue.erase(V); - } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the live-ins of this VPlan to \p O. + void printLiveIns(raw_ostream &O) const; + /// Print this VPlan to \p O. void print(raw_ostream &O) const; @@ -2628,10 +2752,6 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } - /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there - /// be only one at most. If there isn't one, then return nullptr. - VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); - void addLiveOut(PHINode *PN, VPValue *V); void removeLiveOut(PHINode *PN) { @@ -2959,6 +3079,9 @@ namespace vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(VPValue *Def); +/// Returns true if only the first part of \p Def is used. +bool onlyFirstPartUsed(VPValue *Def); + /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp new file mode 100644 index 000000000000..97a8a1803bbf --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -0,0 +1,237 @@ +//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "VPlanAnalysis.h" +#include "VPlan.h" +#include "llvm/ADT/TypeSwitch.h" + +using namespace llvm; + +#define DEBUG_TYPE "vplan" + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) { + Type *ResTy = inferScalarType(R->getIncomingValue(0)); + for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) { + VPValue *Inc = R->getIncomingValue(I); + assert(inferScalarType(Inc) == ResTy && + "different types inferred for different incoming values"); + CachedTypes[Inc] = ResTy; + } + return ResTy; +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { + switch (R->getOpcode()) { + case Instruction::Select: { + Type *ResTy = inferScalarType(R->getOperand(1)); + VPValue *OtherV = R->getOperand(2); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; + } + case VPInstruction::FirstOrderRecurrenceSplice: { + Type *ResTy = inferScalarType(R->getOperand(0)); + VPValue *OtherV = R->getOperand(1); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; + } + default: + break; + } + // Type inference not implemented for opcode. 
+ LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode!"); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { + unsigned Opcode = R->getOpcode(); + switch (Opcode) { + case Instruction::ICmp: + case Instruction::FCmp: + return IntegerType::get(Ctx, 1); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *ResTy = inferScalarType(R->getOperand(0)); + assert(ResTy == inferScalarType(R->getOperand(1)) && + "types for both operands must match for binary op"); + CachedTypes[R->getOperand(1)] = ResTy; + return ResTy; + } + case Instruction::FNeg: + case Instruction::Freeze: + return inferScalarType(R->getOperand(0)); + default: + break; + } + + // Type inference not implemented for opcode. + LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode!"); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { + auto &CI = *cast<CallInst>(R->getUnderlyingInstr()); + return CI.getType(); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe( + const VPWidenMemoryInstructionRecipe *R) { + assert(!R->isStore() && "Store recipes should not define any values"); + return cast<LoadInst>(&R->getIngredient())->getType(); +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) { + Type *ResTy = inferScalarType(R->getOperand(1)); + VPValue *OtherV = R->getOperand(2); + assert(inferScalarType(OtherV) == ResTy && + "different types inferred for different operands"); + CachedTypes[OtherV] = ResTy; + return ResTy; +} + +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) { + switch (R->getUnderlyingInstr()->getOpcode()) { + case Instruction::Call: { + unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 
2 : 1); + return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue()) + ->getReturnType(); + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *ResTy = inferScalarType(R->getOperand(0)); + assert(ResTy == inferScalarType(R->getOperand(1)) && + "inferred types for operands of binary op don't match"); + CachedTypes[R->getOperand(1)] = ResTy; + return ResTy; + } + case Instruction::Select: { + Type *ResTy = inferScalarType(R->getOperand(1)); + assert(ResTy == inferScalarType(R->getOperand(2)) && + "inferred types for operands of select op don't match"); + CachedTypes[R->getOperand(2)] = ResTy; + return ResTy; + } + case Instruction::ICmp: + case Instruction::FCmp: + return IntegerType::get(Ctx, 1); + case Instruction::Alloca: + case Instruction::BitCast: + case Instruction::Trunc: + case Instruction::SExt: + case Instruction::ZExt: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::ExtractValue: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + return R->getUnderlyingInstr()->getType(); + case Instruction::Freeze: + case Instruction::FNeg: + case Instruction::GetElementPtr: + return inferScalarType(R->getOperand(0)); + case Instruction::Load: + return cast<LoadInst>(R->getUnderlyingInstr())->getType(); + case Instruction::Store: + // FIXME: VPReplicateRecipes with store opcodes still define a result + // VPValue, so we need to handle them here. Remove the code here once this + // is modeled accurately in VPlan. + return Type::getVoidTy(Ctx); + default: + break; + } + // Type inference not implemented for opcode. + LLVM_DEBUG({ + dbgs() << "LV: Found unhandled opcode for: "; + R->getVPSingleValue()->dump(); + }); + llvm_unreachable("Unhandled opcode"); +} + +Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { + if (Type *CachedTy = CachedTypes.lookup(V)) + return CachedTy; + + if (V->isLiveIn()) + return V->getLiveInIRValue()->getType(); + + Type *ResultTy = + TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe()) + .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, + VPReductionPHIRecipe, VPWidenPointerInductionRecipe>( + [this](const auto *R) { + // Handle header phi recipes, except VPWienIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. 
+ return inferScalarType(R->getStartValue()); + }) + .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>( + [](const auto *R) { return R->getScalarType(); }) + .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe, + VPWidenGEPRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) + .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe, + VPWidenCallRecipe, VPWidenMemoryInstructionRecipe, + VPWidenSelectRecipe>( + [this](const auto *R) { return inferScalarTypeForRecipe(R); }) + .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) { + // TODO: Use info from interleave group. + return V->getUnderlyingValue()->getType(); + }) + .Case<VPWidenCastRecipe>( + [](const VPWidenCastRecipe *R) { return R->getResultType(); }); + assert(ResultTy && "could not infer type for the given VPValue"); + CachedTypes[V] = ResultTy; + return ResultTy; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h new file mode 100644 index 000000000000..7276641551ae --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -0,0 +1,61 @@ +//===- VPlanAnalysis.h - Various Analyses working on VPlan ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +class LLVMContext; +class VPValue; +class VPBlendRecipe; +class VPInstruction; +class VPWidenRecipe; +class VPWidenCallRecipe; +class VPWidenIntOrFpInductionRecipe; +class VPWidenMemoryInstructionRecipe; +struct VPWidenSelectRecipe; +class VPReplicateRecipe; +class Type; + +/// An analysis for type-inference for VPValues. +/// It infers the scalar type for a given VPValue by bottom-up traversing +/// through defining recipes until root nodes with known types are reached (e.g. +/// live-ins or load recipes). The types are then propagated top down through +/// operations. +/// Note that the analysis caches the inferred types. A new analysis object must +/// be constructed once a VPlan has been modified in a way that invalidates any +/// of the previously inferred types. +class VPTypeAnalysis { + DenseMap<const VPValue *, Type *> CachedTypes; + LLVMContext &Ctx; + + Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); + Type *inferScalarTypeForRecipe(const VPInstruction *R); + Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenMemoryInstructionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R); + Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); + +public: + VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {} + + /// Infer the type of \p V. Returns the scalar type of \p V. + Type *inferScalarType(const VPValue *V); + + /// Return the LLVMContext used by the analysis. 
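The VPTypeAnalysis declared above walks from a VPValue to its defining recipe, infers the scalar type bottom-up, and caches the result. A hedged usage sketch, where Ctx and Def are placeholders for the vectorized function's LLVMContext and some recipe-defined VPValue (they are not names from the patch):

  // Illustrative only: one analysis instance per (unmodified) VPlan state.
  llvm::VPTypeAnalysis TypeInfo(Ctx);
  llvm::Type *ScalarTy = TypeInfo.inferScalarType(Def);
  // Cached entries go stale once the VPlan is transformed, so construct a
  // fresh VPTypeAnalysis afterwards, as the class comment notes.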
+ LLVMContext &getContext() { return Ctx; } +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index f6e3a2a16db8..f950d4740e41 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -61,6 +61,7 @@ private: // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); + void setRegionPredsFromBB(VPRegionBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); VPBasicBlock *getOrCreateVPBB(BasicBlock *BB); #ifndef NDEBUG @@ -81,14 +82,43 @@ public: // Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB // must have no predecessors. void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) { - SmallVector<VPBlockBase *, 8> VPBBPreds; + auto GetLatchOfExit = [this](BasicBlock *BB) -> BasicBlock * { + auto *SinglePred = BB->getSinglePredecessor(); + Loop *LoopForBB = LI->getLoopFor(BB); + if (!SinglePred || LI->getLoopFor(SinglePred) == LoopForBB) + return nullptr; + // The input IR must be in loop-simplify form, ensuring a single predecessor + // for exit blocks. + assert(SinglePred == LI->getLoopFor(SinglePred)->getLoopLatch() && + "SinglePred must be the only loop latch"); + return SinglePred; + }; + if (auto *LatchBB = GetLatchOfExit(BB)) { + auto *PredRegion = getOrCreateVPBB(LatchBB)->getParent(); + assert(VPBB == cast<VPBasicBlock>(PredRegion->getSingleSuccessor()) && + "successor must already be set for PredRegion; it must have VPBB " + "as single successor"); + VPBB->setPredecessors({PredRegion}); + return; + } // Collect VPBB predecessors. + SmallVector<VPBlockBase *, 2> VPBBPreds; for (BasicBlock *Pred : predecessors(BB)) VPBBPreds.push_back(getOrCreateVPBB(Pred)); - VPBB->setPredecessors(VPBBPreds); } +static bool isHeaderBB(BasicBlock *BB, Loop *L) { + return L && BB == L->getHeader(); +} + +void PlainCFGBuilder::setRegionPredsFromBB(VPRegionBlock *Region, + BasicBlock *BB) { + // BB is a loop header block. Connect the region to the loop preheader. + Loop *LoopOfBB = LI->getLoopFor(BB); + Region->setPredecessors({getOrCreateVPBB(LoopOfBB->getLoopPredecessor())}); +} + // Add operands to VPInstructions representing phi nodes from the input IR. void PlainCFGBuilder::fixPhiNodes() { for (auto *Phi : PhisToFix) { @@ -100,38 +130,85 @@ void PlainCFGBuilder::fixPhiNodes() { assert(VPPhi->getNumOperands() == 0 && "Expected VPInstruction with no operands."); + Loop *L = LI->getLoopFor(Phi->getParent()); + if (isHeaderBB(Phi->getParent(), L)) { + // For header phis, make sure the incoming value from the loop + // predecessor is the first operand of the recipe. 
+ assert(Phi->getNumOperands() == 2); + BasicBlock *LoopPred = L->getLoopPredecessor(); + VPPhi->addIncoming( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)), + BB2VPBB[LoopPred]); + BasicBlock *LoopLatch = L->getLoopLatch(); + VPPhi->addIncoming( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)), + BB2VPBB[LoopLatch]); + continue; + } + for (unsigned I = 0; I != Phi->getNumOperands(); ++I) VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)), BB2VPBB[Phi->getIncomingBlock(I)]); } } +static bool isHeaderVPBB(VPBasicBlock *VPBB) { + return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB; +} + +/// Return true of \p L loop is contained within \p OuterLoop. +static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) { + if (L->getLoopDepth() < OuterLoop->getLoopDepth()) + return false; + const Loop *P = L; + while (P) { + if (P == OuterLoop) + return true; + P = P->getParentLoop(); + } + return false; +} + // Create a new empty VPBasicBlock for an incoming BasicBlock in the region // corresponding to the containing loop or retrieve an existing one if it was // already created. If no region exists yet for the loop containing \p BB, a new // one is created. VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { - auto BlockIt = BB2VPBB.find(BB); - if (BlockIt != BB2VPBB.end()) + if (auto *VPBB = BB2VPBB.lookup(BB)) { // Retrieve existing VPBB. - return BlockIt->second; - - // Get or create a region for the loop containing BB. - Loop *CurrentLoop = LI->getLoopFor(BB); - VPRegionBlock *ParentR = nullptr; - if (CurrentLoop) { - auto Iter = Loop2Region.insert({CurrentLoop, nullptr}); - if (Iter.second) - Iter.first->second = new VPRegionBlock( - CurrentLoop->getHeader()->getName().str(), false /*isReplicator*/); - ParentR = Iter.first->second; + return VPBB; } // Create new VPBB. - LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n"); - VPBasicBlock *VPBB = new VPBasicBlock(BB->getName()); + StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName(); + LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); + VPBasicBlock *VPBB = new VPBasicBlock(Name); BB2VPBB[BB] = VPBB; - VPBB->setParent(ParentR); + + // Get or create a region for the loop containing BB. + Loop *LoopOfBB = LI->getLoopFor(BB); + if (!LoopOfBB || !doesContainLoop(LoopOfBB, TheLoop)) + return VPBB; + + auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB); + if (!isHeaderBB(BB, LoopOfBB)) { + assert(RegionOfVPBB && + "Region should have been created by visiting header earlier"); + VPBB->setParent(RegionOfVPBB); + return VPBB; + } + + assert(!RegionOfVPBB && + "First visit of a header basic block expects to register its region."); + // Handle a header - take care of its Region. + if (LoopOfBB == TheLoop) { + RegionOfVPBB = Plan.getVectorLoopRegion(); + } else { + RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/); + RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); + } + RegionOfVPBB->setEntry(VPBB); + Loop2Region[LoopOfBB] = RegionOfVPBB; return VPBB; } @@ -254,6 +331,25 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // Main interface to build the plain CFG. void PlainCFGBuilder::buildPlainCFG() { + // 0. Reuse the top-level region, vector-preheader and exit VPBBs from the + // skeleton. These were created directly rather than via getOrCreateVPBB(), + // revisit them now to update BB2VPBB. 
Note that header/entry and + // latch/exiting VPBB's of top-level region have yet to be created. + VPRegionBlock *TheRegion = Plan.getVectorLoopRegion(); + BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); + assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && + "Unexpected loop preheader"); + auto *VectorPreheaderVPBB = + cast<VPBasicBlock>(TheRegion->getSinglePredecessor()); + // ThePreheaderBB conceptually corresponds to both Plan.getPreheader() (which + // wraps the original preheader BB) and Plan.getEntry() (which represents the + // new vector preheader); here we're interested in setting BB2VPBB to the + // latter. + BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; + BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); + assert(LoopExitBB && "Loops with multiple exits are not supported."); + BB2VPBB[LoopExitBB] = cast<VPBasicBlock>(TheRegion->getSingleSuccessor()); + // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. Create a VPBB for // each BB and link it to its successor and predecessor VPBBs. Note that @@ -263,21 +359,11 @@ void PlainCFGBuilder::buildPlainCFG() { // Loop PH needs to be explicitly visited since it's not taken into account by // LoopBlocksDFS. - BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader(); - assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) && - "Unexpected loop preheader"); - VPBasicBlock *ThePreheaderVPBB = Plan.getEntry(); - BB2VPBB[ThePreheaderBB] = ThePreheaderVPBB; - ThePreheaderVPBB->setName("vector.ph"); for (auto &I : *ThePreheaderBB) { if (I.getType()->isVoidTy()) continue; IRDef2VPValue[&I] = Plan.getVPValueOrAddLiveIn(&I); } - // Create empty VPBB for Loop H so that we can link PH->H. - VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader()); - HeaderVPBB->setName("vector.body"); - ThePreheaderVPBB->setOneSuccessor(HeaderVPBB); LoopBlocksRPO RPO(TheLoop); RPO.perform(LI); @@ -286,88 +372,55 @@ void PlainCFGBuilder::buildPlainCFG() { // Create or retrieve the VPBasicBlock for this BB and create its // VPInstructions. VPBasicBlock *VPBB = getOrCreateVPBB(BB); + VPRegionBlock *Region = VPBB->getParent(); createVPInstructionsForVPBB(VPBB, BB); + Loop *LoopForBB = LI->getLoopFor(BB); + // Set VPBB predecessors in the same order as they are in the incoming BB. + if (!isHeaderBB(BB, LoopForBB)) { + setVPBBPredsFromBB(VPBB, BB); + } else { + // BB is a loop header, set the predecessor for the region, except for the + // top region, whose predecessor was set when creating VPlan's skeleton. + assert(isHeaderVPBB(VPBB) && "isHeaderBB and isHeaderVPBB disagree"); + if (TheRegion != Region) + setRegionPredsFromBB(Region, BB); + } // Set VPBB successors. We create empty VPBBs for successors if they don't // exist already. Recipes will be created when the successor is visited // during the RPO traversal. 
- Instruction *TI = BB->getTerminator(); - assert(TI && "Terminator expected."); - unsigned NumSuccs = TI->getNumSuccessors(); - + auto *BI = cast<BranchInst>(BB->getTerminator()); + unsigned NumSuccs = succ_size(BB); if (NumSuccs == 1) { - VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0)); - assert(SuccVPBB && "VPBB Successor not found."); - VPBB->setOneSuccessor(SuccVPBB); - } else if (NumSuccs == 2) { - VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0)); - assert(SuccVPBB0 && "Successor 0 not found."); - VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1)); - assert(SuccVPBB1 && "Successor 1 not found."); - - // Get VPBB's condition bit. - assert(isa<BranchInst>(TI) && "Unsupported terminator!"); - // Look up the branch condition to get the corresponding VPValue - // representing the condition bit in VPlan (which may be in another VPBB). - assert(IRDef2VPValue.count(cast<BranchInst>(TI)->getCondition()) && - "Missing condition bit in IRDef2VPValue!"); - - // Link successors. - VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1); - } else - llvm_unreachable("Number of successors not supported."); - - // Set VPBB predecessors in the same order as they are in the incoming BB. - setVPBBPredsFromBB(VPBB, BB); + auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor()); + VPBB->setOneSuccessor(isHeaderVPBB(Successor) + ? Successor->getParent() + : static_cast<VPBlockBase *>(Successor)); + continue; + } + assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() && + "block must have conditional branch with 2 successors"); + // Look up the branch condition to get the corresponding VPValue + // representing the condition bit in VPlan (which may be in another VPBB). + assert(IRDef2VPValue.contains(BI->getCondition()) && + "Missing condition bit in IRDef2VPValue!"); + VPBasicBlock *Successor0 = getOrCreateVPBB(BI->getSuccessor(0)); + VPBasicBlock *Successor1 = getOrCreateVPBB(BI->getSuccessor(1)); + if (!LoopForBB || BB != LoopForBB->getLoopLatch()) { + VPBB->setTwoSuccessors(Successor0, Successor1); + continue; + } + // For a latch we need to set the successor of the region rather than that + // of VPBB and it should be set to the exit, i.e., non-header successor, + // except for the top region, whose successor was set when creating VPlan's + // skeleton. + if (TheRegion != Region) + Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1 + : Successor0); + Region->setExiting(VPBB); } - // 2. Process outermost loop exit. We created an empty VPBB for the loop - // single exit BB during the RPO traversal of the loop body but Instructions - // weren't visited because it's not part of the the loop. - BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); - assert(LoopExitBB && "Loops with multiple exits are not supported."); - VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB]; - // Loop exit was already set as successor of the loop exiting BB. - // We only set its predecessor VPBB now. - setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB); - - // 3. Fix up region blocks for loops. For each loop, - // * use the header block as entry to the corresponding region, - // * use the latch block as exit of the corresponding region, - // * set the region as successor of the loop pre-header, and - // * set the exit block as successor to the region. 
- SmallVector<Loop *> LoopWorkList; - LoopWorkList.push_back(TheLoop); - while (!LoopWorkList.empty()) { - Loop *L = LoopWorkList.pop_back_val(); - BasicBlock *Header = L->getHeader(); - BasicBlock *Exiting = L->getLoopLatch(); - assert(Exiting == L->getExitingBlock() && - "Latch must be the only exiting block"); - VPRegionBlock *Region = Loop2Region[L]; - VPBasicBlock *HeaderVPBB = getOrCreateVPBB(Header); - VPBasicBlock *ExitingVPBB = getOrCreateVPBB(Exiting); - - // Disconnect backedge and pre-header from header. - VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(L->getLoopPreheader()); - VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB); - VPBlockUtils::disconnectBlocks(ExitingVPBB, HeaderVPBB); - - Region->setParent(PreheaderVPBB->getParent()); - Region->setEntry(HeaderVPBB); - VPBlockUtils::connectBlocks(PreheaderVPBB, Region); - - // Disconnect exit block from exiting (=latch) block, set exiting block and - // connect region to exit block. - VPBasicBlock *ExitVPBB = getOrCreateVPBB(L->getExitBlock()); - VPBlockUtils::disconnectBlocks(ExitingVPBB, ExitVPBB); - Region->setExiting(ExitingVPBB); - VPBlockUtils::connectBlocks(Region, ExitVPBB); - - // Queue sub-loops for processing. - LoopWorkList.append(L->begin(), L->end()); - } - // 4. The whole CFG has been built at this point so all the input Values must + // 2. The whole CFG has been built at this point so all the input Values must // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding // VPlan operands. fixPhiNodes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 26c309eed800..02e400d590be 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "VPlan.h" +#include "VPlanAnalysis.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -43,6 +44,8 @@ extern cl::opt<bool> EnableVPlanNativePath; bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { + case VPInterleaveSC: + return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; case VPWidenMemoryInstructionSC: { return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); } @@ -114,6 +117,16 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPPredInstPHISC: return false; + case VPInstructionSC: + switch (cast<VPInstruction>(this)->getOpcode()) { + case Instruction::ICmp: + case VPInstruction::Not: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + return false; + default: + return true; + } case VPWidenCallSC: return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) ->mayHaveSideEffects(); @@ -135,6 +148,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveSC: + return mayWriteToMemory(); case VPWidenMemoryInstructionSC: assert(cast<VPWidenMemoryInstructionRecipe>(this) ->getIngredient() @@ -156,8 +171,13 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); if (vputils::isUniformAfterVectorization(ExitValue)) Lane = VPLane::getFirstLane(); + VPBasicBlock *MiddleVPBB = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); + 
assert(MiddleVPBB->getNumSuccessors() == 0 && + "the middle block must not have any successors"); + BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), - State.Builder.GetInsertBlock()); + MiddleBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -216,15 +236,55 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } +FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { + assert(OpType == OperationType::FPMathOp && + "recipe doesn't have fast math flags"); + FastMathFlags Res; + Res.setAllowReassoc(FMFs.AllowReassoc); + Res.setNoNaNs(FMFs.NoNaNs); + Res.setNoInfs(FMFs.NoInfs); + Res.setNoSignedZeros(FMFs.NoSignedZeros); + Res.setAllowReciprocal(FMFs.AllowReciprocal); + Res.setAllowContract(FMFs.AllowContract); + Res.setApproxFunc(FMFs.ApproxFunc); + return Res; +} + +VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, + VPValue *A, VPValue *B, DebugLoc DL, + const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), + Pred, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + assert(Opcode == Instruction::ICmp && + "only ICmp predicates supported at the moment"); +} + +VPInstruction::VPInstruction(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + FastMathFlags FMFs, DebugLoc DL, const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + // Make sure the VPInstruction is a floating-point operation. + assert(isFPMathOp() && "this op can't take fast-math flags"); +} + Value *VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilderBase &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); + Builder.SetCurrentDebugLocation(getDebugLoc()); if (Instruction::isBinaryOp(getOpcode())) { + if (Part != 0 && vputils::onlyFirstPartUsed(this)) + return State.get(this, 0); + Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + auto *Res = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + if (auto *I = dyn_cast<Instruction>(Res)) + setFlags(I); + return Res; } switch (getOpcode()) { @@ -232,10 +292,10 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *A = State.get(getOperand(0), Part); return Builder.CreateNot(A, Name); } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - return Builder.CreateICmpULE(IV, TC, Name); + case Instruction::ICmp: { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); @@ -285,23 +345,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); return Builder.CreateSelect(Cmp, Sub, Zero); } - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). 
- Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - return Builder.CreateAdd(Phi, Step, Name, IsNUW, false); - } - return State.get(this, 0); - } - - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::CanonicalIVIncrementForPartNUW: { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + case VPInstruction::CanonicalIVIncrementForPart: { auto *IV = State.get(getOperand(0), VPIteration(0, 0)); if (Part == 0) return IV; @@ -309,7 +353,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, // The canonical IV is incremented by the vectorization factor (num of SIMD // elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, IsNUW, false); + return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), + hasNoSignedWrap()); } case VPInstruction::BranchOnCond: { if (Part != 0) @@ -361,10 +406,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, } } +#if !defined(NDEBUG) +bool VPInstruction::isFPMathOp() const { + // Inspired by FPMathOperator::classof. Notable differences are that we don't + // support Call, PHI and Select opcodes here yet. + return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp || Opcode == Instruction::Select; +} +#endif + void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); + assert((hasFastMathFlags() == isFPMathOp() || + getOpcode() == Instruction::Select) && + "Recipe not a FPMathOp but has fast-math flags?"); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *GeneratedValue = generateInstruction(State, Part); if (!hasResult()) @@ -393,9 +453,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Not: O << "not"; break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; case VPInstruction::SLPLoad: O << "combined load"; break; @@ -408,12 +465,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; @@ -421,49 +472,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << "TC > VF ? 
TC - VF : 0"; break; case VPInstruction::CanonicalIVIncrementForPart: - O << "VF * Part + "; - break; - case VPInstruction::CanonicalIVIncrementForPartNUW: - O << "VF * Part +(nuw) "; + O << "VF * Part +"; break; case VPInstruction::BranchOnCount: - O << "branch-on-count "; + O << "branch-on-count"; break; default: O << Instruction::getOpcodeName(getOpcode()); } - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); - } + printFlags(O); + printOperands(O, SlotTracker); - if (DL) { + if (auto DL = getDebugLoc()) { O << ", !dbg "; DL.print(O); } } #endif -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); auto &CI = *cast<CallInst>(getUnderlyingInstr()); assert(!isa<DbgInfoIntrinsic>(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - State.setDebugLocFromInst(&CI); + State.setDebugLocFrom(CI.getDebugLoc()); + FunctionType *VFTy = nullptr; + if (Variant) + VFTy = Variant->getFunctionType(); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. @@ -475,12 +512,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. Value *Arg; - if (VectorIntrinsicID == Intrinsic::not_intrinsic || - !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) - Arg = State.get(I.value(), Part); - else + if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || + (VectorIntrinsicID != Intrinsic::not_intrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) Arg = State.get(I.value(), VPIteration(0, 0)); + else + Arg = State.get(I.value(), Part); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); @@ -553,8 +593,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); + State.setDebugLocFrom(getDebugLoc()); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. 
@@ -569,13 +608,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.addMetadata(Sel, &I); + State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } } +VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( + const FastMathFlags &FMF) { + AllowReassoc = FMF.allowReassoc(); + NoNaNs = FMF.noNaNs(); + NoInfs = FMF.noInfs(); + NoSignedZeros = FMF.noSignedZeros(); + AllowReciprocal = FMF.allowReciprocal(); + AllowContract = FMF.allowContract(); + ApproxFunc = FMF.approxFunc(); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { switch (OpType) { + case OperationType::Cmp: + O << " " << CmpInst::getPredicateName(getPredicate()); + break; + case OperationType::DisjointOp: + if (DisjointFlags.IsDisjoint) + O << " disjoint"; + break; case OperationType::PossiblyExactOp: if (ExactFlags.IsExact) O << " exact"; @@ -593,17 +650,22 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { if (GEPFlags.IsInBounds) O << " inbounds"; break; + case OperationType::NonNegOp: + if (NonNegFlags.NonNeg) + O << " nneg"; + break; case OperationType::Other: break; } - O << " "; + if (getNumOperands() > 0) + O << " "; } #endif void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; - switch (I.getOpcode()) { + switch (Opcode) { case Instruction::Call: case Instruction::Br: case Instruction::PHI: @@ -630,28 +692,24 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Value *, 2> Ops; for (VPValue *VPOp : operands()) Ops.push_back(State.get(VPOp, Part)); - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + Value *V = Builder.CreateNAryOp(Opcode, Ops); if (auto *VecOp = dyn_cast<Instruction>(V)) setFlags(VecOp); // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.addMetadata(V, &I); + State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { Value *Op = State.get(getOperand(0), Part); @@ -663,9 +721,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); + bool FCmp = Opcode == Instruction::FCmp; for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -673,51 +729,64 @@ void VPWidenRecipe::execute(VPTransformState &State) { if (FCmp) { // Propagate fast math flags. 
IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) + Builder.setFastMathFlags(I->getFastMathFlags()); + C = Builder.CreateFCmp(getPredicate(), A, B); } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + C = Builder.CreateICmp(getPredicate(), A, B); } State.set(this, C, Part); - State.addMetadata(C, &I); + State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } default: // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " + << Instruction::getOpcodeName(Opcode)); llvm_unreachable("Unhandled instruction!"); } // end of switch. + +#if !defined(NDEBUG) + // Verify that VPlan type inference results agree with the type of the + // generated values. + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), + State.VF) == State.get(this, Part)->getType() && + "inferred type and type from generated instructions do not match"); + } +#endif } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); - const Instruction *UI = getUnderlyingInstr(); - O << " = " << UI->getOpcodeName(); + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); - if (auto *Cmp = dyn_cast<CmpInst>(UI)) - O << Cmp->getPredicate() << " "; printOperands(O, SlotTracker); } #endif void VPWidenCastRecipe::execute(VPTransformState &State) { - auto *I = cast_or_null<Instruction>(getUnderlyingValue()); - if (I) - State.setDebugLocFromInst(I); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; /// Vectorize casts. assert(State.VF.isVector() && "Not vectorizing?"); Type *DestTy = VectorType::get(getResultType(), State.VF); - + VPValue *Op = getOperand(0); for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); + if (Part > 0 && Op->isLiveIn()) { + // FIXME: Remove once explicit unrolling is implemented using VPlan. + State.set(this, State.get(this, 0), Part); + continue; + } + Value *A = State.get(Op, Part); Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); State.set(this, Cast, Part); - State.addMetadata(Cast, I); + State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); } } @@ -727,10 +796,182 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode) << " "; + printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); } +#endif + +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilderBase &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. 
+ auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + if (STy->isFloatingPointTy()) { + Type *InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { + assert(FTy->isFloatingPointTy() && "Expected floating point type!"); + Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); + Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); + return B.CreateUIToFP(RuntimeVF, FTy); +} + +void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Int or FP induction being replicated."); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. 
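The getStepVector helper shown above produces, for lane i, the value Val[i] + (StartIdx + i) * Step, built out of IR instructions. A plain scalar model of the integer case (a hypothetical helper for illustration, not the actual IR-building code):

```cpp
// Sketch only: lane i of the result is Val[i] + (StartIdx + i) * Step.
#include <cstdint>
#include <vector>

std::vector<int64_t> stepVector(const std::vector<int64_t> &Val,
                                int64_t StartIdx, int64_t Step) {
  std::vector<int64_t> Out(Val.size());
  for (size_t I = 0; I < Val.size(); ++I)
    Out[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
  return Out;
}

// Example: Val = splat(10), StartIdx = 0, Step = 3 -> {10, 13, 16, 19}.
```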
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast<IntegerType>(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); + VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa<TruncInst>(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. 
+ VecInd->addIncoming(LastInduction, VectorPH); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -770,17 +1011,112 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, O << " * "; getStepValue()->printAsOperand(O, SlotTracker); - if (IndDesc.getStep()->getType() != ResultTy) - O << " (truncated to " << *ResultTy << ")"; + if (TruncResultTy) + O << " (truncated to " << *TruncResultTy << ")"; } #endif +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); + + /// Compute scalar induction steps. \p ScalarIV is the scalar induction + /// variable on which to base the steps, \p Step is the size of the step. + + Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + IRBuilderBase &Builder = State.Builder; + + // Ensure step has the same type as that of scalar IV. + Type *BaseIVTy = BaseIV->getType()->getScalarType(); + if (BaseIVTy != Step->getType()) { + // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to + // avoid separate truncate here. + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + Step = State.Builder.CreateTrunc(Step, BaseIVTy); + } + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (BaseIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = InductionOpcode; + MulOp = Instruction::FMul; + } + + // Determine the number of scalars we need to generate for each unroll + // iteration. + bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); + // Compute the scalar steps and save the results in State. + Type *IntStepTy = + IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); + Type *VecIVTy = nullptr; + Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; + if (!FirstLaneOnly && State.VF.isScalable()) { + VecIVTy = VectorType::get(BaseIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); + } + + unsigned StartPart = 0; + unsigned EndPart = State.UF; + unsigned StartLane = 0; + unsigned EndLane = FirstLaneOnly ? 
1 : State.VF.getKnownMinValue(); + if (State.Instance) { + StartPart = State.Instance->Part; + EndPart = StartPart + 1; + StartLane = State.Instance->Lane.getKnownLane(); + EndLane = StartLane + 1; + } + for (unsigned Part = StartPart; Part < EndPart; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); + + if (!FirstLaneOnly && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); + auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); + if (BaseIVTy->isFloatingPointTy()) + InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); + auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); + auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); + State.set(this, Add, Part); + // It's useful to record the lane values too for the known minimum number + // of elements so we do those below. This improves the code quality when + // trying to extract the first element, for example. + } + + if (BaseIVTy->isFloatingPointTy()) + StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); + + for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { + Value *StartIdx = Builder.CreateBinOp( + AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); + State.set(this, Add, VPIteration(Part, Lane)); + } + } +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); - O << Indent << "= SCALAR-STEPS "; + O << " = SCALAR-STEPS "; printOperands(O, SlotTracker); } #endif @@ -874,7 +1210,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); + State.setDebugLocFrom(getDebugLoc()); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. @@ -916,7 +1252,7 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; - Phi->printAsOperand(O, false); + printAsOperand(O, SlotTracker); O << " ="; if (getNumIncomingValues() == 1) { // Not a User of any mask: not really blending, this is a @@ -942,14 +1278,14 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << " +"; if (isa<FPMathOperator>(getUnderlyingInstr())) O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; + O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; - if (RdxDesc->IntermediateStore) + if (RdxDesc.IntermediateStore) O << " (with final reduction value stored in invariant address sank " "outside of loop)"; } @@ -1093,12 +1429,12 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); EntryPart->addIncoming(Start, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(this, EntryPart, Part); } @@ -1108,7 +1444,8 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; + O << " = CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1221,8 +1558,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { } // Create a phi node for the new recurrence. - PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); EntryPart->addIncoming(VectorInit, VectorPH); State.set(this, EntryPart, 0); } @@ -1254,8 +1591,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { "recipe must be in the vector loop header"); unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); + Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); + EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); State.set(this, EntryPart, Part); } @@ -1269,8 +1606,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + // MinMax and AnyOf reductions have the start value as their identity. if (ScalarPHI) { Iden = StartV; } else { @@ -1316,23 +1653,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - VPBasicBlock *Parent = getParent(); - VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); - unsigned StartIdx = 0; - // For phis in header blocks of loop regions, use the index of the value - // coming from the preheader. 
- if (LoopRegion->getEntryBasicBlock() == Parent) { - for (unsigned I = 0; I < getNumOperands(); ++I) { - if (getIncomingBlock(I) == - LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) - StartIdx = I; - } - } - Value *Op0 = State.get(getOperand(StartIdx), 0); + Value *Op0 = State.get(getOperand(0), 0); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); State.set(this, VecPhi, 0); @@ -1368,7 +1689,7 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { PHINode *EntryPart = State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); EntryPart->addIncoming(StartMask, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); State.set(this, EntryPart, Part); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 83bfdfd09d19..33132880d5a4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -12,17 +12,22 @@ //===----------------------------------------------------------------------===// #include "VPlanTransforms.h" -#include "VPlanDominatorTree.h" #include "VPRecipeBuilder.h" +#include "VPlanAnalysis.h" #include "VPlanCFG.h" +#include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; +using namespace llvm::PatternMatch; + void VPlanTransforms::VPInstructionsToVPRecipes( VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); } else if (auto *CI = dyn_cast<CastInst>(Inst)) { NewRecipe = new VPWidenCastRecipe( - CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI); + CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); } else { NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); } @@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. 
Clone->insertBefore(SinkCandidate); - for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - continue; - - for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) { - if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue()) - continue; - UI->setOperand(Idx, Clone); - } - } + SinkCandidate->getVPSingleValue()->replaceUsesWithIf( + Clone, [SinkTo](VPUser &U, unsigned) { + return cast<VPRecipeBase>(&U)->getParent() != SinkTo; + }); } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) @@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - for (VPUser *U : to_vector(Phi1ToMoveV->users())) { - auto *UI = dyn_cast<VPRecipeBase>(U); - if (!UI || UI->getParent() != Then2) - continue; - for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) { - if (Phi1ToMoveV != U->getOperand(I)) - continue; - U->setOperand(I, PredInst1); - } - } + Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) { + auto *UI = dyn_cast<VPRecipeBase>(&U); + return UI && UI->getParent() == Then2; + }); Phi1ToMove.moveBefore(*Merge2, Merge2->begin()); } @@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) { - return V->getNumUsers() > 0; - })) + // A user keeps R alive: + if (any_of(R.definedValues(), + [](VPValue *V) { return V->getNumUsers(); })) + continue; + + // Having side effects keeps R alive, but do remove conditional assume + // instructions as their conditions may be flattened. + auto *RepR = dyn_cast<VPReplicateRecipe>(&R); + bool IsConditionalAssume = + RepR && RepR->isPredicated() && + match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>()); + if (R.mayHaveSideEffects() && !IsConditionalAssume) continue; + R.eraseFromParent(); } } } +static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID, + ScalarEvolution &SE, Instruction *TruncI, + Type *IVTy, VPValue *StartV, + VPValue *Step) { + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto IP = HeaderVPBB->getFirstNonPhi(); + VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + Type *TruncTy = TruncI ? TruncI->getType() : IVTy; + VPValue *BaseIV = CanonicalIV; + if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) { + BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step, + TruncI ? 
TruncI->getType() : nullptr); + HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP); + } + + VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step); + HeaderVPBB->insert(Steps, IP); + return Steps; +} + void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { SmallVector<VPRecipeBase *> ToRemove; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); @@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) { })) continue; - auto IP = HeaderVPBB->getFirstNonPhi(); - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); - Type *ResultTy = WideIV->getPHINode()->getType(); - if (Instruction *TruncI = WideIV->getTruncInst()) - ResultTy = TruncI->getType(); const InductionDescriptor &ID = WideIV->getInductionDescriptor(); - VPValue *Step = WideIV->getStepValue(); - VPValue *BaseIV = CanonicalIV; - if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step, - ResultTy)) { - BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV, - Step, ResultTy); - HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP); - } - - VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step); - HeaderVPBB->insert(Steps, IP); + VPValue *Steps = createScalarIVSteps( + Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(), + WideIV->getStartValue(), WideIV->getStepValue()); - // Update scalar users of IV to use Step instead. Use SetVector to ensure - // the list of users doesn't contain duplicates. - SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end()); - for (VPUser *U : Users) { - if (HasOnlyVectorVFs && !U->usesScalars(WideIV)) - continue; - for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) { - if (U->getOperand(I) != WideIV) - continue; - U->setOperand(I, Steps); - } - } + // Update scalar users of IV to use Step instead. + if (!HasOnlyVectorVFs) + WideIV->replaceAllUsesWith(Steps); + else + WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) { + return U.usesScalars(WideIV); + }); } } @@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } } + +/// Returns true is \p V is constant one. +static bool isConstantOne(VPValue *V) { + if (!V->isLiveIn()) + return false; + auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue()); + return C && C->isOne(); +} + +/// Returns the llvm::Instruction opcode for \p R. +static unsigned getOpcodeForRecipe(VPRecipeBase &R) { + if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R)) + return WidenR->getUnderlyingInstr()->getOpcode(); + if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R)) + return WidenC->getOpcode(); + if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) + return RepR->getUnderlyingInstr()->getOpcode(); + if (auto *VPI = dyn_cast<VPInstruction>(&R)) + return VPI->getOpcode(); + return 0; +} + +/// Try to simplify recipe \p R. 
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { + switch (getOpcodeForRecipe(R)) { + case Instruction::Mul: { + VPValue *A = R.getOperand(0); + VPValue *B = R.getOperand(1); + if (isConstantOne(A)) + return R.getVPSingleValue()->replaceAllUsesWith(B); + if (isConstantOne(B)) + return R.getVPSingleValue()->replaceAllUsesWith(A); + break; + } + case Instruction::Trunc: { + VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe(); + if (!Ext) + break; + unsigned ExtOpcode = getOpcodeForRecipe(*Ext); + if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt) + break; + VPValue *A = Ext->getOperand(0); + VPValue *Trunc = R.getVPSingleValue(); + Type *TruncTy = TypeInfo.inferScalarType(Trunc); + Type *ATy = TypeInfo.inferScalarType(A); + if (TruncTy == ATy) { + Trunc->replaceAllUsesWith(A); + } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } +#ifndef NDEBUG + // Verify that the cached type info is for both A and its users is still + // accurate by comparing it to freshly computed types. + VPTypeAnalysis TypeInfo2(TypeInfo.getContext()); + assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); + for (VPUser *U : A->users()) { + auto *R = dyn_cast<VPRecipeBase>(U); + if (!R) + continue; + for (VPValue *VPV : R->definedValues()) + assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); + } +#endif + break; + } + default: + break; + } +} + +/// Try to simplify the recipes in \p Plan. +static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) { + ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( + Plan.getEntry()); + VPTypeAnalysis TypeInfo(Ctx); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + simplifyRecipe(R, TypeInfo); + } + } +} + +void VPlanTransforms::truncateToMinimalBitwidths( + VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs, + LLVMContext &Ctx) { +#ifndef NDEBUG + // Count the processed recipes and cross check the count later with MinBWs + // size, to make sure all entries in MinBWs have been handled. + unsigned NumProcessedRecipes = 0; +#endif + // Keep track of created truncates, so they can be re-used. Note that we + // cannot use RAUW after creating a new truncate, as this would could make + // other uses have different types for their operands, making them invalidly + // typed. + DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; + VPTypeAnalysis TypeInfo(Ctx); + VPBasicBlock *PH = Plan.getEntry(); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getVectorLoopRegion()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, + VPWidenSelectRecipe>(&R)) + continue; + + VPValue *ResultVPV = R.getVPSingleValue(); + auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue()); + unsigned NewResSizeInBits = MinBWs.lookup(UI); + if (!NewResSizeInBits) + continue; + +#ifndef NDEBUG + NumProcessedRecipes++; +#endif + // If the value wasn't vectorized, we must maintain the original scalar + // type. 
Skip those here, after incrementing NumProcessedRecipes. Also + // skip casts which do not need to be handled explicitly here, as + // redundant casts will be removed during recipe simplification. + if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) { +#ifndef NDEBUG + // If any of the operands is a live-in and not used by VPWidenRecipe or + // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as + // processed as well. When MinBWs is currently constructed, there is no + // information about whether recipes are widened or replicated and in + // case they are reciplicated the operands are not truncated. Counting + // them them here ensures we do not miss any recipes in MinBWs. + // TODO: Remove once the analysis is done on VPlan. + for (VPValue *Op : R.operands()) { + if (!Op->isLiveIn()) + continue; + auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue()); + if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) && + all_of(Op->users(), [](VPUser *U) { + return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U); + })) { + // Add an entry to ProcessedTruncs to avoid counting the same + // operand multiple times. + ProcessedTruncs[Op] = nullptr; + NumProcessedRecipes += 1; + } + } +#endif + continue; + } + + Type *OldResTy = TypeInfo.inferScalarType(ResultVPV); + unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits(); + assert(OldResTy->isIntegerTy() && "only integer types supported"); + if (OldResSizeInBits == NewResSizeInBits) + continue; + assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); + (void)OldResSizeInBits; + + auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits); + + // Shrink operands by introducing truncates as needed. + unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0; + for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) { + auto *Op = R.getOperand(Idx); + unsigned OpSizeInBits = + TypeInfo.inferScalarType(Op)->getScalarSizeInBits(); + if (OpSizeInBits == NewResSizeInBits) + continue; + assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate"); + auto [ProcessedIter, IterIsEmpty] = + ProcessedTruncs.insert({Op, nullptr}); + VPWidenCastRecipe *NewOp = + IterIsEmpty + ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy) + : ProcessedIter->second; + R.setOperand(Idx, NewOp); + if (!IterIsEmpty) + continue; + ProcessedIter->second = NewOp; + if (!Op->isLiveIn()) { + NewOp->insertBefore(&R); + } else { + PH->appendRecipe(NewOp); +#ifndef NDEBUG + auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue()); + bool IsContained = MinBWs.contains(OpInst); + NumProcessedRecipes += IsContained; +#endif + } + } + + // Any wrapping introduced by shrinking this operation shouldn't be + // considered undefined behavior. So, we can't unconditionally copy + // arithmetic wrapping flags to VPW. + if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R)) + VPW->dropPoisonGeneratingFlags(); + + // Extend result to original width. 
+ auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); + Ext->insertAfter(&R); + ResultVPV->replaceAllUsesWith(Ext); + Ext->setOperand(0, ResultVPV); + } + } + + assert(MinBWs.size() == NumProcessedRecipes && + "some entries in MinBWs haven't been processed"); +} + +void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { + removeRedundantCanonicalIVs(Plan); + removeRedundantInductionCasts(Plan); + + optimizeInductions(Plan, SE); + simplifyRecipes(Plan, SE.getContext()); + removeDeadRecipes(Plan); + + createAndOptimizeReplicateRegions(Plan); + + removeRedundantExpandSCEVRecipes(Plan); + mergeBlocksIntoPredecessors(Plan); +} + +// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace +// the loop terminator with a branch-on-cond recipe with the negated +// active-lane-mask as operand. Note that this turns the loop into an +// uncountable one. Only the existing terminator is replaced, all other existing +// recipes/users remain unchanged, except for poison-generating flags being +// dropped from the canonical IV increment. Return the created +// VPActiveLaneMaskPHIRecipe. +// +// The function uses the following definitions: +// +// %TripCount = DataWithControlFlowWithoutRuntimeCheck ? +// calculate-trip-count-minus-VF (original TC) : original TC +// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? +// CanonicalIVPhi : CanonicalIVIncrement +// %StartV is the canonical induction start value. +// +// The function adds the following recipes: +// +// vector.ph: +// %TripCount = calculate-trip-count-minus-VF (original TC) +// [if DataWithControlFlowWithoutRuntimeCheck] +// %EntryInc = canonical-iv-increment-for-part %StartV +// %EntryALM = active-lane-mask %EntryInc, %TripCount +// +// vector.body: +// ... +// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] +// ... +// %InLoopInc = canonical-iv-increment-for-part %IncrementValue +// %ALM = active-lane-mask %InLoopInc, TripCount +// %Negated = Not %ALM +// branch-on-cond %Negated +// +static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( + VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); + auto *CanonicalIVPHI = Plan.getCanonicalIV(); + VPValue *StartV = CanonicalIVPHI->getStartValue(); + + auto *CanonicalIVIncrement = + cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); + // TODO: Check if dropping the flags is needed if + // !DataAndControlFlowWithoutRuntimeCheck. + CanonicalIVIncrement->dropPoisonGeneratingFlags(); + DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor()); + VPBuilder Builder(VecPreheader); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (!DataAndControlFlowWithoutRuntimeCheck) { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. 
+ IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } else { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + IncrementValue = CanonicalIVPHI; + TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, + {TC}, DL); + } + auto *EntryIncrement = Builder.createOverflowingOp( + VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, + "index.part.next"); + + // Create the active lane mask instruction in the VPlan preheader. + auto *EntryALM = + Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, + DL, "active.lane.mask.entry"); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + LaneMaskPhi->insertAfter(CanonicalIVPHI); + + // Create the active lane mask for the next iteration of the loop before the + // original terminator. + VPRecipeBase *OriginalTerminator = EB->getTerminator(); + Builder.setInsertPoint(OriginalTerminator); + auto *InLoopIncrement = + Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, + {IncrementValue}, {false, false}, DL); + auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {InLoopIncrement, TripCount}, DL, + "active.lane.mask.next"); + LaneMaskPhi->addOperand(ALM); + + // Replace the original terminator with BranchOnCond. We have to invert the + // mask here because a true condition means jumping to the exit block. + auto *NotMask = Builder.createNot(ALM, DL); + Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); + OriginalTerminator->eraseFromParent(); + return LaneMaskPhi; +} + +void VPlanTransforms::addActiveLaneMask( + VPlan &Plan, bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck) { + assert((!DataAndControlFlowWithoutRuntimeCheck || + UseActiveLaneMaskForControlFlow) && + "DataAndControlFlowWithoutRuntimeCheck implies " + "UseActiveLaneMaskForControlFlow"); + + auto FoundWidenCanonicalIVUser = + find_if(Plan.getCanonicalIV()->users(), + [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + assert(FoundWidenCanonicalIVUser && + "Must have widened canonical IV when tail folding!"); + auto *WideCanonicalIV = + cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); + VPRecipeBase *LaneMask; + if (UseActiveLaneMaskForControlFlow) { + LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( + Plan, DataAndControlFlowWithoutRuntimeCheck); + } else { + LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount()}, + nullptr, "active.lane.mask"); + LaneMask->insertAfter(WideCanonicalIV); + } + + // Walk users of WideCanonicalIV and replace all compares of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an + // active-lane-mask. 
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) { + auto *CompareToReplace = dyn_cast<VPInstruction>(U); + if (!CompareToReplace || + CompareToReplace->getOpcode() != Instruction::ICmp || + CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || + CompareToReplace->getOperand(1) != BTC) + continue; + + assert(CompareToReplace->getOperand(0) == WideCanonicalIV && + "WidenCanonicalIV must be the first operand of the compare"); + CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue()); + CompareToReplace->eraseFromParent(); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3eccf6e9600d..3bf91115debb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -22,11 +22,9 @@ class InductionDescriptor; class Instruction; class PHINode; class ScalarEvolution; -class Loop; class PredicatedScalarEvolution; class TargetLibraryInfo; class VPBuilder; -class VPRecipeBuilder; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding @@ -37,12 +35,56 @@ struct VPlanTransforms { GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI); + /// Sink users of fixed-order recurrences after the recipe defining their + /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions + /// to combine the value from the recurrence phis and previous values. The + /// current implementation assumes all users can be sunk after the previous + /// value, which is enforced by earlier legality checks. + /// \returns true if all users of fixed-order recurrences could be re-arranged + /// as needed or false if it is not possible. In the latter case, \p Plan is + /// not valid. + static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + + /// Clear NSW/NUW flags from reduction instructions if necessary. + static void clearReductionWrapFlags(VPlan &Plan); + + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the + /// resulting plan to \p BestVF and \p BestUF. + static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE); + + /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe + /// optimizations, dead recipe removal, replicate region optimizations and + /// block merging. + static void optimize(VPlan &Plan, ScalarEvolution &SE); + /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then /// region block and remove the mask operand. Optimize the created regions by /// iteratively sinking scalar operands into the region, followed by merging /// regions until no improvements are remaining. static void createAndOptimizeReplicateRegions(VPlan &Plan); + /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an + /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p + /// UseActiveLaneMaskForControlFlow is true, introduce an + /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is + /// true, no minimum-iteration runtime check will be created (during skeleton + /// creation) and instead it is handled using active-lane-mask. \p + /// DataAndControlFlowWithoutRuntimeCheck implies \p + /// UseActiveLaneMaskForControlFlow. 
+ static void addActiveLaneMask(VPlan &Plan, + bool UseActiveLaneMaskForControlFlow, + bool DataAndControlFlowWithoutRuntimeCheck); + + /// Insert truncates and extends for any truncated recipe. Redundant casts + /// will be folded later. + static void + truncateToMinimalBitwidths(VPlan &Plan, + const MapVector<Instruction *, uint64_t> &MinBWs, + LLVMContext &Ctx); + +private: /// Remove redundant VPBasicBlocks by merging them into their predecessor if /// the predecessor has a single successor. static bool mergeBlocksIntoPredecessors(VPlan &Plan); @@ -71,24 +113,6 @@ struct VPlanTransforms { /// them with already existing recipes expanding the same SCEV expression. static void removeRedundantExpandSCEVRecipes(VPlan &Plan); - /// Sink users of fixed-order recurrences after the recipe defining their - /// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions - /// to combine the value from the recurrence phis and previous values. The - /// current implementation assumes all users can be sunk after the previous - /// value, which is enforced by earlier legality checks. - /// \returns true if all users of fixed-order recurrences could be re-arranged - /// as needed or false if it is not possible. In the latter case, \p Plan is - /// not valid. - static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); - - /// Clear NSW/NUW flags from reduction instructions if necessary. - static void clearReductionWrapFlags(VPlan &Plan); - - /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the - /// resulting plan to \p BestVF and \p BestUF. - static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, - unsigned BestUF, - PredicatedScalarEvolution &PSE); }; } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h index ac110bb3b0ef..116acad8e8f3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -121,18 +121,11 @@ public: /// Remove a single \p User from the list of users. void removeUser(VPUser &User) { - bool Found = false; // The same user can be added multiple times, e.g. because the same VPValue // is used twice by the same VPUser. Remove a single one. - erase_if(Users, [&User, &Found](VPUser *Other) { - if (Found) - return false; - if (Other == &User) { - Found = true; - return true; - } - return false; - }); + auto *I = find(Users, &User); + if (I != Users.end()) + Users.erase(I); } typedef SmallVectorImpl<VPUser *>::iterator user_iterator; @@ -163,6 +156,13 @@ public: void replaceAllUsesWith(VPValue *New); + /// Go through the uses list for this VPValue and make each use point to \p + /// New if the callback ShouldReplace returns true for the given use specified + /// by a pair of (VPUser, the use index). + void replaceUsesWithIf( + VPValue *New, + llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace); + /// Returns the recipe defining this VPValue or nullptr if it is not defined /// by a recipe, i.e. is a live-in. VPRecipeBase *getDefiningRecipe(); @@ -296,6 +296,14 @@ public: "Op must be an operand of the recipe"); return false; } + + /// Returns true if the VPUser only uses the first part of operand \p Op. + /// Conservatively returns false. 
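Editor's note: the VPlanValue.h hunk above replaces the erase_if-with-a-found-flag dance in `removeUser` with a straightforward find-and-erase, which removes exactly one occurrence even when the same user is registered several times. A sketch of that pattern with the standard library (illustrative, not the VPValue code itself); the `onlyFirstPartUsed` declaration documented just above continues after this note.

```cpp
// Remove a single occurrence of a value from a vector, leaving any duplicates
// in place, which is the behaviour removeUser needs when one VPUser holds
// several uses of the same VPValue.  Plain std:: types, for illustration only.
#include <algorithm>
#include <cassert>
#include <vector>

template <typename T>
void removeSingleOccurrence(std::vector<T> &Vec, const T &Val) {
  auto It = std::find(Vec.begin(), Vec.end(), Val);
  if (It != Vec.end())
    Vec.erase(It);
}

int main() {
  std::vector<int> Users = {1, 2, 2, 3};
  removeSingleOccurrence(Users, 2);
  // One '2' remains: the old erase_if needed the extra Found flag to stop
  // after the first match, which this form gets for free.
  assert((Users == std::vector<int>{1, 2, 3}));
}
```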
+ virtual bool onlyFirstPartUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return false; + } }; /// This class augments a recipe with a set of VPValues defined by the recipe. @@ -325,7 +333,7 @@ class VPDef { assert(V->Def == this && "can only remove VPValue linked with this VPDef"); assert(is_contained(DefinedValues, V) && "VPValue to remove must be in DefinedValues"); - erase_value(DefinedValues, V); + llvm::erase(DefinedValues, V); V->Def = nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 13464c9d3496..f18711ba30b7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -28,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include <numeric> +#include <queue> #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -100,8 +103,9 @@ private: Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); - bool foldBitcastShuf(Instruction &I); + bool foldBitcastShuffle(Instruction &I); bool scalarizeBinopOrCmp(Instruction &I); + bool scalarizeVPIntrinsic(Instruction &I); bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); @@ -258,8 +262,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // It is safe and potentially profitable to load a vector directly: // inselt undef, load Scalar, 0 --> load VecPtr IRBuilder<> Builder(Load); - Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( - SrcPtr, MinVecTy->getPointerTo(AS)); + Value *CastedPtr = + Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); VecLd = Builder.CreateShuffleVector(VecLd, Mask); @@ -321,7 +325,7 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) { IRBuilder<> Builder(Load); Value *CastedPtr = - Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS)); + Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); replaceValue(I, *VecLd); ++NumVecLoad; @@ -677,7 +681,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. -bool VectorCombine::foldBitcastShuf(Instruction &I) { +bool VectorCombine::foldBitcastShuffle(Instruction &I) { Value *V; ArrayRef<int> Mask; if (!match(&I, m_BitCast( @@ -687,35 +691,43 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for // scalable type is unknown; Second, we cannot reason if the narrowed shuffle // mask for scalable type is a splat or not. 
- // 2) Disallow non-vector casts and length-changing shuffles. + // 2) Disallow non-vector casts. // TODO: We could allow any shuffle. + auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); - if (!SrcTy || I.getOperand(0)->getType() != SrcTy) + if (!DestTy || !SrcTy) + return false; + + unsigned DestEltSize = DestTy->getScalarSizeInBits(); + unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); + if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) return false; - auto *DestTy = cast<FixedVectorType>(I.getType()); - unsigned DestNumElts = DestTy->getNumElements(); - unsigned SrcNumElts = SrcTy->getNumElements(); SmallVector<int, 16> NewMask; - if (SrcNumElts <= DestNumElts) { + if (DestEltSize <= SrcEltSize) { // The bitcast is from wide to narrow/equal elements. The shuffle mask can // always be expanded to the equivalent form choosing narrower elements. - assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); - unsigned ScaleFactor = DestNumElts / SrcNumElts; + assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = SrcEltSize / DestEltSize; narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); } else { // The bitcast is from narrow elements to wide elements. The shuffle mask // must choose consecutive elements to allow casting first. - assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask"); - unsigned ScaleFactor = SrcNumElts / DestNumElts; + assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestEltSize / SrcEltSize; if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) return false; } + // Bitcast the shuffle src - keep its original width but using the destination + // scalar type. + unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; + auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts); + // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. InstructionCost DestCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask); + TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask); InstructionCost SrcCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask); if (DestCost > SrcCost || !DestCost.isValid()) @@ -723,12 +735,131 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) { // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; - Value *CastV = Builder.CreateBitCast(V, DestTy); + Value *CastV = Builder.CreateBitCast(V, ShuffleTy); Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); replaceValue(I, *Shuf); return true; } +/// VP Intrinsics whose vector operands are both splat values may be simplified +/// into the scalar version of the operation and the result splatted. This +/// can lead to scalarization down the line. +bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { + if (!isa<VPIntrinsic>(I)) + return false; + VPIntrinsic &VPI = cast<VPIntrinsic>(I); + Value *Op0 = VPI.getArgOperand(0); + Value *Op1 = VPI.getArgOperand(1); + + if (!isSplatValue(Op0) || !isSplatValue(Op1)) + return false; + + // Check getSplatValue early in this function, to avoid doing unnecessary + // work. 
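Editor's note: `foldBitcastShuffle` above now scales the shuffle mask by the ratio of source to destination element sizes rather than by element counts, which also admits length-changing shuffles. The sketch below re-implements the two mask rewrites in isolation so the scaling direction is easy to see; it is a simplified stand-in for `narrowShuffleMaskElts`/`widenShuffleMaskElts`, not the LLVM helpers themselves, and it ignores undef (-1) mask entries for brevity. The splat-value checks for the VP-intrinsic scalarization described just above continue right after this note.

```cpp
// Simplified stand-ins for the mask rewrites used by foldBitcastShuffle: when
// a bitcast changes the element size by an integral factor, the shuffle mask
// can be rewritten to index the re-typed elements.
#include <cstdio>
#include <vector>

// Each wide element M expands to Scale narrow elements M*Scale..M*Scale+Scale-1.
std::vector<int> narrowMask(unsigned Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (unsigned I = 0; I < Scale; ++I)
      Out.push_back(M * (int)Scale + (int)I);
  return Out;
}

// Scale consecutive, aligned narrow elements fold into one wide element;
// returns false when the mask does not select whole wide elements.
bool widenMask(unsigned Scale, const std::vector<int> &Mask,
               std::vector<int> &Out) {
  if (Mask.size() % Scale != 0)
    return false;
  for (size_t I = 0; I < Mask.size(); I += Scale) {
    int Leader = Mask[I];
    if (Leader % (int)Scale != 0)
      return false;
    for (unsigned J = 1; J < Scale; ++J)
      if (Mask[I + J] != Leader + (int)J)
        return false;
    Out.push_back(Leader / (int)Scale);
  }
  return true;
}

int main() {
  // A <2 x i64> shuffle mask <1,0> viewed as <4 x i32> becomes <2,3,0,1>.
  for (int M : narrowMask(2, {1, 0}))
    std::printf("%d ", M);
  std::printf("\n");
  // A <4 x i32> mask <2,3,0,1> viewed as <2 x i64> becomes <1,0>.
  std::vector<int> Wide;
  if (widenMask(2, {2, 3, 0, 1}, Wide))
    std::printf("%d %d\n", Wide[0], Wide[1]);
}
```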
+ Value *ScalarOp0 = getSplatValue(Op0); + Value *ScalarOp1 = getSplatValue(Op1); + if (!ScalarOp0 || !ScalarOp1) + return false; + + // For the binary VP intrinsics supported here, the result on disabled lanes + // is a poison value. For now, only do this simplification if all lanes + // are active. + // TODO: Relax the condition that all lanes are active by using insertelement + // on inactive lanes. + auto IsAllTrueMask = [](Value *MaskVal) { + if (Value *SplattedVal = getSplatValue(MaskVal)) + if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) + return ConstValue->isAllOnesValue(); + return false; + }; + if (!IsAllTrueMask(VPI.getArgOperand(2))) + return false; + + // Check to make sure we support scalarization of the intrinsic + Intrinsic::ID IntrID = VPI.getIntrinsicID(); + if (!VPBinOpIntrinsic::isVPBinOp(IntrID)) + return false; + + // Calculate cost of splatting both operands into vectors and the vector + // intrinsic + VectorType *VecTy = cast<VectorType>(VPI.getType()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost SplatCost = + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + + // Calculate the cost of the VP Intrinsic + SmallVector<Type *, 4> Args; + for (Value *V : VPI.args()) + Args.push_back(V->getType()); + IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); + InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); + InstructionCost OldCost = 2 * SplatCost + VectorOpCost; + + // Determine scalar opcode + std::optional<unsigned> FunctionalOpcode = + VPI.getFunctionalOpcode(); + std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; + if (!FunctionalOpcode) { + ScalarIntrID = VPI.getFunctionalIntrinsicID(); + if (!ScalarIntrID) + return false; + } + + // Calculate cost of scalarizing + InstructionCost ScalarOpCost = 0; + if (ScalarIntrID) { + IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); + ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); + } else { + ScalarOpCost = + TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType()); + } + + // The existing splats may be kept around if other instructions use them. + InstructionCost CostToKeepSplats = + (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); + InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; + + LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI + << "\n"); + LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost + << ", Cost of scalarizing:" << NewCost << "\n"); + + // We want to scalarize unless the vector variant actually has lower cost. + if (OldCost < NewCost || !NewCost.isValid()) + return false; + + // Scalarize the intrinsic + ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount(); + Value *EVL = VPI.getArgOperand(3); + const DataLayout &DL = VPI.getModule()->getDataLayout(); + + // If the VP op might introduce UB or poison, we can scalarize it provided + // that we know the EVL > 0: If the EVL is zero, then the original VP op + // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by + // scalarizing it. 
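Editor's note: the cost comparison above weighs the original sequence (splat both operands, then run the vector VP op) against running the scalar op once and splatting the result, charging the operand splats again only when they have other users. A tiny arithmetic sketch of that bookkeeping with invented cost numbers; in the pass the values come from TargetTransformInfo. The EVL / speculation-safety check that the comment just above describes follows next in the diff.

```cpp
// Toy version of the scalarizeVPIntrinsic cost comparison.  The numbers are
// invented; the pass obtains them from TTI.getVectorInstrCost,
// TTI.getShuffleCost, TTI.getIntrinsicInstrCost and TTI.getArithmeticInstrCost.
#include <cstdio>

struct Costs {
  unsigned SplatCost;    // insertelement + broadcast shuffle
  unsigned VectorOpCost; // the VP intrinsic itself
  unsigned ScalarOpCost; // the scalar add/sub/...
};

bool shouldScalarize(const Costs &C, bool Op0HasOtherUses,
                     bool Op1HasOtherUses) {
  unsigned OldCost = 2 * C.SplatCost + C.VectorOpCost;
  // One splat is still needed to broadcast the scalar result, and any operand
  // splat that has other users must be kept around as well.
  unsigned CostToKeepSplats = (Op0HasOtherUses ? C.SplatCost : 0) +
                              (Op1HasOtherUses ? C.SplatCost : 0);
  unsigned NewCost = C.ScalarOpCost + C.SplatCost + CostToKeepSplats;
  return NewCost <= OldCost; // scalarize unless the vector form is cheaper
}

int main() {
  Costs C{/*SplatCost=*/3, /*VectorOpCost=*/2, /*ScalarOpCost=*/1};
  std::printf("%d\n", shouldScalarize(C, /*Op0*/ false, /*Op1*/ false)); // 1
  std::printf("%d\n", shouldScalarize(C, /*Op0*/ true, /*Op1*/ true));   // 0
}
```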
+ bool SafeToSpeculate; + if (ScalarIntrID) + SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID) + .hasFnAttr(Attribute::AttrKind::Speculatable); + else + SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( + *FunctionalOpcode, &VPI, nullptr, &AC, &DT); + if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT)) + return false; + + Value *ScalarVal = + ScalarIntrID + ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID, + {ScalarOp0, ScalarOp1}) + : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode), + ScalarOp0, ScalarOp1); + + replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal)); + return true; +} + /// Match a vector binop or compare instruction with at least one inserted /// scalar operand and convert to scalar binop/cmp followed by insertelement. bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { @@ -1013,19 +1144,24 @@ public: /// Check if it is legal to scalarize a memory access to \p VecTy at index \p /// Idx. \p Idx must access a valid vector element. -static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy, - Value *Idx, Instruction *CtxI, +static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, + Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT) { + // We do checks for both fixed vector types and scalable vector types. + // This is the number of elements of fixed vector types, + // or the minimum number of elements of scalable vector types. + uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); + if (auto *C = dyn_cast<ConstantInt>(Idx)) { - if (C->getValue().ult(VecTy->getNumElements())) + if (C->getValue().ult(NumElements)) return ScalarizationResult::safe(); return ScalarizationResult::unsafe(); } unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); APInt Zero(IntWidth, 0); - APInt MaxElts(IntWidth, VecTy->getNumElements()); + APInt MaxElts(IntWidth, NumElements); ConstantRange ValidIndices(Zero, MaxElts); ConstantRange IdxRange(IntWidth, true); @@ -1074,8 +1210,7 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment, // store i32 %b, i32* %1 bool VectorCombine::foldSingleElementStore(Instruction &I) { auto *SI = cast<StoreInst>(&I); - if (!SI->isSimple() || - !isa<FixedVectorType>(SI->getValueOperand()->getType())) + if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType())) return false; // TODO: Combine more complicated patterns (multiple insert) by referencing @@ -1089,13 +1224,13 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { return false; if (auto *Load = dyn_cast<LoadInst>(Source)) { - auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType()); + auto VecTy = cast<VectorType>(SI->getValueOperand()->getType()); const DataLayout &DL = I.getModule()->getDataLayout(); Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); // Don't optimize for atomic/volatile load or store. Ensure memory is not // modified between, vector type matches store size, and index is inbounds. 
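Editor's note: `canScalarizeAccess` above now handles both fixed and scalable vectors by validating the index against the known minimum number of elements, which is exact for fixed types and a conservative lower bound for scalable ones (a `<vscale x 4 x i32>` has at least 4 lanes). Below is a small sketch of that bound check; `ToyElementCount` mimics the shape of `llvm::ElementCount` but is a local stand-in. The load/store preconditions listed in the comment just above are checked immediately after this note.

```cpp
// Sketch of the "index < known-minimum element count" bound used by
// canScalarizeAccess.  For scalable vectors the known-minimum lane count is a
// lower bound on the real lane count, so an index below it is always in
// bounds; anything else is rejected as not provably safe.
#include <cstdint>
#include <cstdio>

struct ToyElementCount {
  uint64_t MinVal;  // known minimum number of lanes
  bool IsScalable;  // multiplied by the runtime vscale when true
  uint64_t knownMin() const { return MinVal; }
};

bool constantIndexIsSafe(uint64_t Idx, ToyElementCount EC) {
  // Fixed <4 x i32>: exactly 4 lanes.  Scalable <vscale x 4 x i32>: at least
  // 4 lanes.  Either way, Idx < knownMin() cannot go out of bounds.
  return Idx < EC.knownMin();
}

int main() {
  ToyElementCount Fixed{4, false}, Scalable{4, true};
  std::printf("%d %d\n", constantIndexIsSafe(3, Fixed),
              constantIndexIsSafe(3, Scalable)); // 1 1
  // Index 7 might be in bounds at runtime for the scalable type, but it is
  // not provably so, hence the conservative rejection.
  std::printf("%d %d\n", constantIndexIsSafe(4, Fixed),
              constantIndexIsSafe(7, Scalable)); // 0 0
}
```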
if (!Load->isSimple() || Load->getParent() != SI->getParent() || - !DL.typeSizeEqualsStoreSize(Load->getType()) || + !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || SrcAddr != SI->getPointerOperand()->stripPointerCasts()) return false; @@ -1130,19 +1265,26 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *FixedVT = cast<FixedVectorType>(I.getType()); + auto *VecTy = cast<VectorType>(I.getType()); auto *LI = cast<LoadInst>(&I); const DataLayout &DL = I.getModule()->getDataLayout(); - if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT)) + if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; InstructionCost OriginalCost = - TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(), + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; unsigned NumInstChecked = 0; + DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; + auto FailureGuard = make_scope_exit([&]() { + // If the transform is aborted, discard the ScalarizationResults. + for (auto &Pair : NeedFreeze) + Pair.second.discard(); + }); + // Check if all users of the load are extracts with no memory modifications // between the load and the extract. Compute the cost of both the original // code and the scalarized version. @@ -1151,9 +1293,6 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!UI || UI->getParent() != LI->getParent()) return false; - if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT)) - return false; - // Check if any instruction between the load and the extract may modify // memory. if (LastCheckedInst->comesBefore(UI)) { @@ -1168,22 +1307,23 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LastCheckedInst = UI; } - auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); - if (!ScalarIdx.isSafe()) { - // TODO: Freeze index if it is safe to do so. - ScalarIdx.discard(); + auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); + if (ScalarIdx.isUnsafe()) return false; + if (ScalarIdx.isSafeWithFreeze()) { + NeedFreeze.try_emplace(UI, ScalarIdx); + ScalarIdx.discard(); } auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; OriginalCost += - TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind, + TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, Index ? Index->getZExtValue() : -1); ScalarizedCost += - TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(), + TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), Align(1), LI->getPointerAddressSpace()); - ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType()); + ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); } if (ScalarizedCost >= OriginalCost) @@ -1192,21 +1332,27 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); - Builder.SetInsertPoint(EI); - Value *Idx = EI->getOperand(1); + + // Insert 'freeze' for poison indexes. 
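Editor's note: `scalarizeLoadExtract` above collects ScalarizationResults that may require a freeze and guards them with `llvm::make_scope_exit`: if any later check bails out, the guard discards the pending results, and on success `FailureGuard.release()` disarms it. A minimal sketch of that RAII pattern using `llvm/ADT/ScopeExit.h`; the surrounding bookkeeping (`PendingFreezes`, `tryTransform`) is invented. The freeze insertion announced by the comment just above follows next in the diff.

```cpp
// The FailureGuard idiom: register a cleanup that throws away speculative
// bookkeeping, and disarm it with release() only once the whole transform has
// committed.  PendingFreezes is a toy stand-in for the NeedFreeze map.
#include "llvm/ADT/ScopeExit.h"
#include <cstdio>
#include <map>

bool tryTransform(bool EverythingChecksOut) {
  std::map<int, const char *> PendingFreezes;
  auto FailureGuard = llvm::make_scope_exit([&]() {
    // Runs on every early return: drop state that only a successful
    // transform is allowed to consume.
    PendingFreezes.clear();
    std::printf("aborted, discarded pending work\n");
  });

  PendingFreezes[0] = "freeze idx0";
  if (!EverythingChecksOut)
    return false; // guard fires here

  // ... apply the rewrite using PendingFreezes ...
  FailureGuard.release(); // success: the cleanup must not run
  return true;
}

int main() {
  tryTransform(false); // prints the abort message
  tryTransform(true);  // silent
}
```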
+ auto It = NeedFreeze.find(EI); + if (It != NeedFreeze.end()) + It->second.freeze(Builder, *cast<Instruction>(Idx)); + + Builder.SetInsertPoint(EI); Value *GEP = - Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx}); + Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( - FixedVT->getElementType(), GEP, EI->getName() + ".scalar")); + VecTy->getElementType(), GEP, EI->getName() + ".scalar")); Align ScalarOpAlignment = computeAlignmentAfterScalarization( - LI->getAlign(), FixedVT->getElementType(), Idx, DL); + LI->getAlign(), VecTy->getElementType(), Idx, DL); NewLoad->setAlignment(ScalarOpAlignment); replaceValue(*EI, *NewLoad); } + FailureGuard.release(); return true; } @@ -1340,21 +1486,28 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType()); if (!ShuffleInputType) return false; - int NumInputElts = ShuffleInputType->getNumElements(); + unsigned NumInputElts = ShuffleInputType->getNumElements(); // Find the mask from sorting the lanes into order. This is most likely to // become a identity or concat mask. Undef elements are pushed to the end. SmallVector<int> ConcatMask; Shuffle->getShuffleMask(ConcatMask); sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); + // In the case of a truncating shuffle it's possible for the mask + // to have an index greater than the size of the resulting vector. + // This requires special handling. + bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts; bool UsesSecondVec = - any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); + any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; }); + + FixedVectorType *VecTyForCost = + (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; InstructionCost OldCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - Shuffle->getShuffleMask()); + UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + VecTyForCost, Shuffle->getShuffleMask()); InstructionCost NewCost = TTI.getShuffleCost( - UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, - ConcatMask); + UsesSecondVec ? 
TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, + VecTyForCost, ConcatMask); LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle << "\n"); @@ -1657,16 +1810,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { return SSV->getOperand(Op); return SV->getOperand(Op); }; - Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), GetShuffleOperand(SVI0A, 1), V1A); - Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), GetShuffleOperand(SVI0B, 1), V1B); - Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), GetShuffleOperand(SVI1A, 1), V2A); - Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef()); + Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); @@ -1723,9 +1876,6 @@ bool VectorCombine::run() { case Instruction::ShuffleVector: MadeChange |= widenSubvectorLoad(I); break; - case Instruction::Load: - MadeChange |= scalarizeLoadExtract(I); - break; default: break; } @@ -1733,13 +1883,15 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms - if (isa<VectorType>(I.getType())) + if (isa<VectorType>(I.getType())) { MadeChange |= scalarizeBinopOrCmp(I); + MadeChange |= scalarizeLoadExtract(I); + MadeChange |= scalarizeVPIntrinsic(I); + } if (Opcode == Instruction::Store) MadeChange |= foldSingleElementStore(I); - // If this is an early pipeline invocation of this pass, we are done. if (TryEarlyFoldsOnly) return; @@ -1758,7 +1910,7 @@ bool VectorCombine::run() { MadeChange |= foldSelectShuffle(I); break; case Instruction::BitCast: - MadeChange |= foldBitcastShuf(I); + MadeChange |= foldBitcastShuffle(I); break; } } else { |
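Editor's note: the `run()` changes at the end of this diff gate `scalarizeBinopOrCmp`, `scalarizeLoadExtract` and the new `scalarizeVPIntrinsic` on any vector result type, fixed or scalable, and keep the cheap folds available to the early, `TryEarlyFoldsOnly` invocation of the pass. For context, here is a sketch of driving VectorCombine over a module with the new pass manager; it assumes the `VectorCombinePass(bool TryEarlyFoldsOnly)` constructor visible in this tree, and the IR string is an arbitrary example, not taken from the diff.

```cpp
// Minimal new-pass-manager driver for VectorCombine (a sketch, assuming the
// VectorCombinePass(bool TryEarlyFoldsOnly) constructor).
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString(
      "define <4 x i32> @f(<4 x i32> %v, i32 %s) {\n"
      "  %i = insertelement <4 x i32> poison, i32 %s, i32 0\n"
      "  %r = add <4 x i32> %v, %i\n"
      "  ret <4 x i32> %r\n"
      "}\n",
      Err, Ctx);
  if (!M)
    return 1;

  // Standard analysis-manager wiring for a function pass pipeline.
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));
  for (Function &F : *M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);

  M->print(outs(), nullptr);
}
```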
