| Field | Value | Details |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:49:08 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-06-04 11:59:04 +0000 |
| commit | 574b7079b96703a748f89ef5adb7dc3e26b8f7fc | (patch) |
| tree | 195000196b1e0cc13dea43258fa240e006f48184 | /contrib/llvm-project/llvm/lib/Transforms/Vectorize |
| parent | 1f6fd64fe9c996b4795ee4a6c66b8f9216747560 | (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
4 files changed, 1027 insertions, 608 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23bb6f0860c9..5ca0adb4242c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -473,18 +473,10 @@ public: /// handle the more complex control flow around the loops. virtual BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, - VPTransformState &State); - /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); - /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, - bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); @@ -496,12 +488,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single GetElementPtrInst based on information gathered and - /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single first-order recurrence or pointer induction PHINode in /// a block. This method handles the induction variable canonicalization. It /// supports both VF = 1 for unrolled loops and arbitrary length vectors. @@ -511,9 +497,9 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. - void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -538,15 +524,6 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Vectorize Load and Store instructions with the base address given in \p - /// Addr, optionally masking the vector operations if \p BlockInMask is - /// non-null. Use \p State to translate given VPValues to IR values in the - /// vectorized loop. - void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, - bool ConsecutiveStride, bool Reverse); - /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. void setDebugLocFromInst(const Value *V, @@ -566,6 +543,17 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. 
+ void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + protected: friend class LoopVectorizationPlanner; @@ -741,16 +729,16 @@ protected: /// vector loop. void addNewMetadata(Instruction *To, const Instruction *Orig); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Collect poison-generating recipes that may generate a poison value that is + /// used after vectorization, even when their operands are not poison. Those + /// recipes meet the following conditions: + /// * Contribute to the address computation of a recipe generating a widen + /// memory load/store (VPWidenMemoryInstructionRecipe or + /// VPInterleaveRecipe). + /// * Such a widen memory load/store has at least one underlying Instruction + /// that is in a basic block that needs predication and after vectorization + /// the generated instruction won't be predicated. + void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. @@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To, LVer->annotateInstWithNoAlias(To, Orig); } +void InnerLoopVectorizer::collectPoisonGeneratingRecipes( + VPTransformState &State) { + + // Collect recipes in the backward slice of `Root` that may generate a poison + // value that is used after vectorization. + SmallPtrSet<VPRecipeBase *, 16> Visited; + auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + SmallVector<VPRecipeBase *, 16> Worklist; + Worklist.push_back(Root); + + // Traverse the backward slice of Root through its use-def chain. + while (!Worklist.empty()) { + VPRecipeBase *CurRec = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(CurRec).second) + continue; + + // Prune search if we find another recipe generating a widen memory + // instruction. Widen memory instructions involved in address computation + // will lead to gather/scatter instructions, which don't need to be + // handled. + if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || + isa<VPInterleaveRecipe>(CurRec)) + continue; + + // This recipe contributes to the address computation of a widen + // load/store. Collect recipe if its underlying instruction has + // poison-generating flags. + Instruction *Instr = CurRec->getUnderlyingInstr(); + if (Instr && Instr->hasPoisonGeneratingFlags()) + State.MayGeneratePoisonRecipes.insert(CurRec); + + // Add new definitions to the worklist. + for (VPValue *operand : CurRec->operands()) + if (VPDef *OpDef = operand->getDef()) + Worklist.push_back(cast<VPRecipeBase>(OpDef)); + } + }); + + // Traverse all the recipes in the VPlan and collect the poison-generating + // recipes in the backward slice starting at the address of a VPWidenRecipe or + // VPInterleaveRecipe. 
+ auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &Recipe : *VPBB) { + if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { + Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + VPDef *AddrDef = WidenRec->getAddr()->getDef(); + if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && + Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { + VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + if (AddrDef) { + // Check if any member of the interleave group needs predication. + const InterleaveGroup<Instruction> *InterGroup = + InterleaveRec->getInterleaveGroup(); + bool NeedPredication = false; + for (int I = 0, NumMembers = InterGroup->getNumMembers(); + I < NumMembers; ++I) { + Instruction *Member = InterGroup->getMember(I); + if (Member) + NeedPredication |= + Legal->blockNeedsPredication(Member->getParent()); + } + + if (NeedPredication) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } + } + } + } +} + void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) { propagateMetadata(To, From); @@ -1541,7 +1607,16 @@ public: // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { + bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + // When we know the load is uniform and the original scalar loop was not + // predicated we don't need to mark it as a predicated instruction. Any + // vectorised blocks created when tail-folding are something artificial we + // have introduced and we know there is always at least one active lane. + // That's why we call Legal->blockNeedsPredication here because it doesn't + // query tail-folding. + if (IsKnownUniform && isa<LoadInst>(I) && + !Legal->blockNeedsPredication(I->getParent())) + return false; if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated @@ -1816,9 +1891,11 @@ private: /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. + /// during vectorization. collectLoopScalars should only add non-uniform nodes + /// to the list if they are used by a load/store instruction that is marked as + /// CM_Scalarize. Non-uniform scalarized instructions will be represented by + /// VF values in the vectorized loop, each corresponding to an iteration of + /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. 
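The hunks above introduce collectPoisonGeneratingRecipes, which walks backwards from the address operand of each conditional widened load/store through the recipe use-def chains and records every recipe whose underlying instruction carries poison-generating flags (nuw/nsw, exact, inbounds), so those flags can be dropped when code is generated. Below is a minimal, self-contained sketch of that worklist-based backward-slice pattern; Node, HasPoisonGeneratingFlags and the example graph are illustrative stand-ins, not the VPlan API:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

// Illustrative stand-in for a recipe/definition in a use-def graph.
struct Node {
  const char *Name;
  bool HasPoisonGeneratingFlags; // e.g. nuw/nsw, exact, inbounds
  bool IsWidenMemoryOp;          // other widened memory ops prune the walk
  std::vector<Node *> Operands;  // definitions this node uses
};

// Collect every node in the backward slice of AddrRoot whose poison-generating
// flags would have to be dropped. The walk prunes at widened memory operations,
// mirroring the patch: addresses computed from another widened load become
// gathers/scatters and need no flag dropping.
static void collectPoisonGeneratingNodes(Node *AddrRoot,
                                         std::unordered_set<Node *> &Out) {
  std::vector<Node *> Worklist{AddrRoot};
  std::unordered_set<Node *> Visited;
  while (!Worklist.empty()) {
    Node *Cur = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(Cur).second)
      continue; // already reached through another use-def path
    if (Cur->IsWidenMemoryOp)
      continue; // prune the search here
    if (Cur->HasPoisonGeneratingFlags)
      Out.insert(Cur); // flags must be dropped when code is generated
    for (Node *Op : Cur->Operands)
      Worklist.push_back(Op); // keep walking the backward slice
  }
}

int main() {
  Node Index{"index.nsw", /*flags*/ true, /*mem*/ false, {}};
  Node Gep{"gep.inbounds", true, false, {&Index}};
  std::unordered_set<Node *> MayGeneratePoison;
  collectPoisonGeneratingNodes(&Gep, MayGeneratePoison); // Gep = address root
  for (const Node *N : MayGeneratePoison)
    std::printf("drop poison-generating flags on %s\n", N->Name);
  return 0;
}
```

In the patch itself the roots are the address operands of VPWidenMemoryInstructionRecipe and VPInterleaveRecipe instances whose underlying blocks need predication, and the resulting set (State.MayGeneratePoisonRecipes) is consulted later to call dropPoisonGeneratingFlags() on the cloned or widened instructions.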
@@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, - bool Reverse) { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); - - Type *ScalarDataTy = getLoadStoreType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); - const Align Alignment = getLoadStoreAlignment(Instr); - bool CreateGatherScatter = !ConsecutiveStride; - - VectorParts BlockInMaskParts(UF); - bool isMaskRequired = BlockInMask; - if (isMaskRequired) - for (unsigned Part = 0; Part < UF; ++Part) - BlockInMaskParts[Part] = State.get(BlockInMask, Part); - - const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { - // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); - PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); - } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); - } - - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - }; - - // Handle Stores: - if (SI) { - setDebugLocFromInst(SI); - - for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = reverseVector(StoredVal); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). 
- } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - setDebugLocFromInst(LI); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - addMetadata(NewLI, LI); - } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); - - // Add metadata to the load, but setVectorValue to the reverse shuffle. - addMetadata(NewLI, LI); - if (Reverse) - NewLI = reverseVector(NewLI); - } - - State.set(Def, NewLI, Part); - } -} - -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + // If the scalarized instruction contributes to the address computation of a + // widen masked load/store which was in a basic block that needed predication + // and is not predicated after vectorization, we can't propagate + // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized + // instruction could feed a poison value to the base address of the widen + // load/store. + if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + Cloned->dropPoisonGeneratingFlags(); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - State.set(Def, Cloned, Instance); + State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. 
if (auto *II = dyn_cast<AssumeInst>(Cloned)) @@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, - VPTransformState &State) { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, EntryPart, Part); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : State.get(Operands.getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { - VPValue *Operand = Operands.getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(VPDef, NewGEP, Part); - addMetadata(NewGEP, GEP); - } - } -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, VPTransformState &State) { @@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); - - bool NeedsVectorIndex = !IsUniform && VF.isScalable(); - Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; - if (NeedsVectorIndex) { - Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); - UnitStepVec = Builder.CreateStepVector(VecIVTy); - PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); - } + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part); - if (NeedsVectorIndex) { - // Here we cache the whole vector, which means we can support the - // extraction of any lane. However, in some cases the extractelement - // instruction that is generated for scalar uses of this vector (e.g. - // a load instruction) is not folded away. Therefore we still - // calculate values for the first n lanes to avoid redundant moves - // (when extracting the 0th element) and to produce scalar code (i.e. - // additional add/gep instructions instead of expensive extractelement - // instructions) when extracting higher-order elements. - Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); - Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); - Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, Part); - } - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); @@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, - VPTransformState &State) { - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. 
- setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : User.operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - State.set(Def, V, Part); - addMetadata(V, &I); - } - - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *B = State.get(User.getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(Def, C, Part); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, Cast, Part); - addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State) { @@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, - VPUser &Operands, - bool InvariantCond, - VPTransformState &State) { - setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = InvariantCond - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : nullptr; - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = - InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); - Value *Op0 = State.get(Operands.getOperand(1), Part); - Value *Op1 = State.get(Operands.getOperand(2), Part); - Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, Sel, Part); - addMetadata(Sel, &I); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. 
Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + // A helper that evaluates a memory access's use of a pointer. If the use will + // be a scalar use and the pointer is only used by memory accesses, we place + // the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - - // If there is more than one user of Update (Ptr), we shouldn't assume it - // will be scalar after vectorisation as other users of the instruction - // may require widening. Otherwise, add it to ScalarPtrs. - if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { - ScalarPtrs.insert(Update); - return; - } - } // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; + // Returns true if \p Indvar is a pointer induction that is used directly by + // load/store instruction \p I. + auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, + Instruction *I) { + return Induction.second.getKind() == + InductionDescriptor::IK_PtrInduction && + (isa<LoadInst>(I) || isa<StoreInst>(I)) && + Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + }; + // Determine if all users of the induction variable are scalar after // vectorization. 
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; @@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; @@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` + // that it is being called from this specific place. // Figure out whether the access is strided and get the stride value // if it's known in compile time @@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For a call to the llvm.fmuladd intrinsic we need to add the cost of a + // normal fmul instruction to the cost of the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += + TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); + // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. 
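The cost-model hunk above prices an in-loop llvm.fmuladd reduction as a widened fmul plus an fadd reduction, and the recipe-building change later in this diff creates exactly that FMul recipe feeding the reduction. A small scalar illustration of why that split is legal, assuming only that llvm.fmuladd may lower either fused or unfused (the function names and the dot-product shape here are illustrative, not from the patch):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reduction written with fmuladd-style accumulation. llvm.fmuladd may be
// lowered fused (fma) or unfused (fmul + fadd), which is what makes the
// decomposition below a legal way to model it.
static double dotFMulAdd(const std::vector<double> &A,
                         const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I)
    Acc = std::fma(A[I], B[I], Acc); // acc = a*b + acc
  return Acc;
}

// The shape the patch costs and produces: a plain multiply per element (the
// new FMul recipe) feeding an fadd reduction of the products.
static double dotSplit(const std::vector<double> &A,
                       const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I) {
    double Prod = A[I] * B[I]; // widened fmul
    Acc += Prod;               // fadd reduction
  }
  return Acc;
}

int main() {
  std::vector<double> A{1.0, 2.0, 3.0}, B{4.0, 5.0, 6.0};
  std::printf("%f %f\n", dotFMulAdd(A, B), dotSplit(A, B));
  return 0;
}
```

Under that reading, adding the cost of a normal fmul on top of the fadd reduction cost (as getReductionPatternCost now does) matches the recipes the planner emits for the fmuladd case.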
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { + if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); @@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "intermediate fn:\n" + << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (VPBB) VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); else { - Plan->setEntry(FirstVPBBForBB); + auto *TopRegion = new VPRegionBlock("vector loop"); + TopRegion->setEntry(FirstVPBBForBB); + Plan->setEntry(TopRegion); HeaderVPBB = FirstVPBBForBB; } VPBB = FirstVPBBForBB; @@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - assert(isa<VPBasicBlock>(Plan->getEntry()) && + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && - "entry block must be set to a non-empty VPBasicBlock"); + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || + (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + + if (IsFMulAdd) { + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe to use as the vector operand for the + // fadd reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); + FMulRecipe->setFastMathFlags(R->getFastMathFlags()); + WidenRecipe->getParent()->insert(FMulRecipe, + WidenRecipe->getIterator()); + VecOp = FMulRecipe; + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.ILV->setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.ILV->addMetadata(Sel, &I); + } } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.ILV->setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.count(this) > 0) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.ILV->addMetadata(V, &I); + } + + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.ILV->setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.ILV->addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.ILV->setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.ILV->addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant, State); + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.ILV->addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. 
The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = IsInBounds + ? State.Builder.CreateInBoundsGEP( + GEP->getSourceElementType(), Ptr, Indices) + : State.Builder.CreateGEP(GEP->getSourceElementType(), + Ptr, Indices); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.ILV->addMetadata(NewGEP, GEP); + } + } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, VPIteration(Part, Lane), IsPredicated, State); } @@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction( - &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask(), Consecutive, Reverse); + + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); + StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); + + assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGatherScatter = !Consecutive; + + auto &Builder = State.Builder; + InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); + bool isMaskRequired = getMask(); + if (isMaskRequired) + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = State.get(getMask(), Part); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + BlockInMaskParts[Part] = + Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); + } else { + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + PartPtr->setIsInBounds(InBounds); + } + + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + + // Handle Stores: + if (SI) { + State.ILV->setDebugLocFromInst(SI); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, Part); + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } else { + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.ILV->addMetadata(NewSI, SI); + } + return; + } + + // Handle loads. + assert(LI && "Must have a load instruction"); + State.ILV->setDebugLocFromInst(LI); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + State.ILV->addMetadata(NewLI, LI); + } else { + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewLI = Builder.CreateMaskedLoad( + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); + else + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + + // Add metadata to the load, but setVectorValue to the reverse shuffle. 
+ State.ILV->addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + } + + State.set(getVPSingleValue(), NewLI, Part); + } } // Determine how to lower the scalar epilogue, which depends on 1) optimising diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. 
- if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. 
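// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the chain walk that
// areTwoInsertFromSameBuildVector performs below, simplified to a single
// chain.  The real code walks both inserts in lockstep and gives up on
// intermediate inserts with more than one use.  The helper name is invented.
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Follows operand 0 of each insertelement back to the vector the buildvector
// sequence started from (typically an undef/poison vector).
static Value *getBuildVectorBase(InsertElementInst *IE) {
  Value *V = IE;
  while (auto *Cur = dyn_cast<InsertElementInst>(V))
    V = Cur->getOperand(0);
  return V;
}
// ---------------------------------------------------------------------------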
+ if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. 
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. 
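// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: folding the first two per-user
// insertelement masks into one two-source permute mask, in the spirit of the
// CombinedMask code that follows.  Plain C++; -1 stands in for
// llvm::UndefMaskElem and the masks are invented for the example.
#include <cstdio>
#include <vector>

int main() {
  constexpr int UndefMaskElem = -1;
  const unsigned MaxVF = 4; // widest destination vector
  const std::vector<int> Mask0 = {0, UndefMaskElem, 2, UndefMaskElem};
  const std::vector<int> Mask1 = {UndefMaskElem, 1, UndefMaskElem}; // narrower user

  std::vector<int> Combined(MaxVF, UndefMaskElem);
  for (unsigned I = 0, E = Mask0.size(); I != E; ++I)
    Combined[I] = Mask0[I]; // first source: indices kept as-is
  for (unsigned I = 0, E = Mask1.size(); I != E; ++I)
    if (Mask1[I] != UndefMaskElem)
      Combined[I] = Mask1[I] + MaxVF; // second source: offset by MaxVF

  for (int M : Combined)
    std::printf("%d ", M); // prints: 0 5 2 -1
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------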
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. 
void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. 
+ for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. 
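// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the core mask merge behind
// IsIdenticalOrLessDefined.  A mask is "less defined" when every lane is
// either undef or agrees with the other mask, so its undef lanes can be filled
// from the more defined copy.  The real lambda additionally requires identical
// vector operands and checks that trailing undefs do not change the number of
// vector registers used.  Plain C++; -1 stands in for llvm::UndefMaskElem and
// the helper name is invented.
#include <cstdio>
#include <vector>

// Returns true if New is compatible with Old; on success, undef lanes of New
// are filled in from Old.
static bool mergeIfLessDefined(std::vector<int> &New,
                               const std::vector<int> &Old) {
  constexpr int UndefMaskElem = -1;
  if (New.size() != Old.size())
    return false;
  for (size_t I = 0, E = New.size(); I != E; ++I) {
    if (New[I] != UndefMaskElem && Old[I] != UndefMaskElem && New[I] != Old[I])
      return false; // both lanes defined but they disagree
    if (New[I] == UndefMaskElem)
      New[I] = Old[I]; // take the more defined lane
  }
  return true;
}

int main() {
  std::vector<int> New = {0, 0, 0, -1};      // shuffle %0, poison, <0,0,0,undef>
  const std::vector<int> Old = {0, 0, 0, 0}; // shuffle %0, poison, <0,0,0,0>
  if (mergeIfLessDefined(New, Old))
    for (int M : New)
      std::printf("%d ", M); // prints: 0 0 0 0
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------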
+ return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, 
BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). 
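// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the sort-then-bundle idea behind
// the CompareSorter / AreCompatibleCompares lambdas defined below.  Compares
// are sorted so that ones with the same operand type and (possibly swapped)
// predicate become adjacent, then maximal runs of compatible neighbours are
// handed to the list vectorizer; the real code also filters out compares that
// look like reductions in other blocks.  Plain C++ with invented keys.
#include <algorithm>
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

struct CmpKey { std::string OpTy; int Pred; }; // stand-in for a CmpInst

int main() {
  std::vector<CmpKey> Cmps = {{"i32", 40}, {"float", 1}, {"i32", 38},
                              {"i32", 40}, {"float", 1}};
  // Sort so that potentially compatible compares end up next to each other.
  std::sort(Cmps.begin(), Cmps.end(), [](const CmpKey &A, const CmpKey &B) {
    return std::tie(A.OpTy, A.Pred) < std::tie(B.OpTy, B.Pred);
  });
  // Each maximal run of compatible neighbours is a candidate bundle.
  for (size_t I = 0, E = Cmps.size(); I != E;) {
    size_t J = I + 1;
    while (J != E && Cmps[J].OpTy == Cmps[I].OpTy && Cmps[J].Pred == Cmps[I].Pred)
      ++J;
    std::printf("bundle of %zu compare(s): (%s, pred %d)\n", J - I,
                Cmps[I].OpTy.c_str(), Cmps[I].Pred);
    I = J;
  }
  return 0;
}
// ---------------------------------------------------------------------------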
+ auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 638467f94e1c..44b5e1df0839 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } @@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << Instruction::getOpcodeName(getOpcode()); } + O << FMF; + for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); @@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 00ee31007cb7..810dd5030f95 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,7 @@ class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPReplicateRecipe; class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. @@ -346,6 +347,10 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Holds recipes that may generate a poison value that is used after + /// vectorization, even when their operands are not poison. + SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; }; /// VPUsers instance used by VPBlockBase to manage CondBit and the block @@ -789,6 +794,7 @@ public: private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; + FastMathFlags FMF; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -802,13 +808,6 @@ public: : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPSingleValue()); - } - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} @@ -870,6 +869,9 @@ public: return true; } } + + /// Set the fast-math flags. + void setFastMathFlags(FastMathFlags FMFNew); }; /// VPWidenRecipe is a recipe for producing a copy of vector type its @@ -1511,7 +1513,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. 
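// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the RAII FastMathFlagGuard
// pattern that VPInstruction::execute uses above so the recipe's fast-math
// flags only apply to the IR it emits.  Standalone program against the LLVM
// C++ API; the module/function names are invented for the example.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("fmf-demo", Ctx);
  auto *FltTy = Type::getFloatTy(Ctx);
  auto *FnTy = FunctionType::get(FltTy, {FltTy, FltTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);

  FastMathFlags FMF;
  FMF.setAllowReassoc();
  FMF.setNoNaNs();

  Value *Sum;
  {
    IRBuilderBase::FastMathFlagGuard Guard(B); // saves the builder's flags
    B.setFastMathFlags(FMF);
    Sum = B.CreateFAdd(F->getArg(0), F->getArg(1), "sum"); // emitted reassoc nnan
  } // previous (empty) flags restored here
  Value *Plain = B.CreateFAdd(Sum, F->getArg(0), "plain"); // emitted without FMF
  B.CreateRet(Plain);
  M.print(outs(), nullptr);
  return 0;
}
// ---------------------------------------------------------------------------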
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
 public:
   VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                                  bool Consecutive, bool Reverse)
-      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
+      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}),
+        VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load),
         Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
-    new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
     setMask(Mask);
   }

@@ -1544,6 +1546,7 @@ public:
   VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *Mask,
                                  bool Consecutive, bool Reverse)
       : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
+        VPValue(VPValue::VPVMemoryInstructionSC, &Store, this),
         Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
     setMask(Mask);
