author    Dimitry Andric <dim@FreeBSD.org>  2021-12-02 21:02:54 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2021-12-02 21:02:54 +0000
commit    f65dcba83ce5035ab88a85fe17628b447eb56e1b (patch)
tree      35f37bb72b3cfc6060193e66c76ee7c9478969b0 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    846a2208a8ab099f595fe7e8b2e6d54a7b5e67fb (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 980
1 file changed, 527 insertions(+), 453 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23bb6f0860c9..5ca0adb4242c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -473,18 +473,10 @@ public:
/// handle the more complex control flow around the loops.
virtual BasicBlock *createVectorizedLoopSkeleton();
- /// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
- VPTransformState &State);
-
/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
- /// Widen a single select instruction within the innermost loop.
- void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
- bool InvariantCond, VPTransformState &State);
-
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State);
@@ -496,12 +488,6 @@ public:
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
- /// Vectorize a single GetElementPtrInst based on information gathered and
- /// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
- unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
-
/// Vectorize a single first-order recurrence or pointer induction PHINode in
/// a block. This method handles the induction variable canonicalization. It
/// supports both VF = 1 for unrolled loops and arbitrary length vectors.
@@ -511,9 +497,9 @@ public:
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+ void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
@@ -538,15 +524,6 @@ public:
ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask = nullptr);
- /// Vectorize Load and Store instructions with the base address given in \p
- /// Addr, optionally masking the vector operations if \p BlockInMask is
- /// non-null. Use \p State to translate given VPValues to IR values in the
- /// vectorized loop.
- void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask,
- bool ConsecutiveStride, bool Reverse);
-
/// Set the debug location in the builder \p Ptr using the debug location in
/// \p V. If \p Ptr is None then it uses the class member's Builder.
void setDebugLocFromInst(const Value *V,
@@ -566,6 +543,17 @@ public:
/// element.
virtual Value *getBroadcastInstrs(Value *V);
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
protected:
friend class LoopVectorizationPlanner;
@@ -741,16 +729,16 @@ protected:
/// vector loop.
void addNewMetadata(Instruction *To, const Instruction *Orig);
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// Collect poison-generating recipes that may generate a poison value that is
+ /// used after vectorization, even when their operands are not poison. Those
+ /// recipes meet the following conditions:
+ /// * Contribute to the address computation of a recipe generating a widen
+ /// memory load/store (VPWidenMemoryInstructionRecipe or
+ /// VPInterleaveRecipe).
+ /// * Such a widen memory load/store has at least one underlying Instruction
+ /// that is in a basic block that needs predication and after vectorization
+ /// the generated instruction won't be predicated.
+ void collectPoisonGeneratingRecipes(VPTransformState &State);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To,
LVer->annotateInstWithNoAlias(To, Orig);
}
+void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
+ VPTransformState &State) {
+
+ // Collect recipes in the backward slice of `Root` that may generate a poison
+ // value that is used after vectorization.
+ SmallPtrSet<VPRecipeBase *, 16> Visited;
+ auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
+ SmallVector<VPRecipeBase *, 16> Worklist;
+ Worklist.push_back(Root);
+
+ // Traverse the backward slice of Root through its use-def chain.
+ while (!Worklist.empty()) {
+ VPRecipeBase *CurRec = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Visited.insert(CurRec).second)
+ continue;
+
+ // Prune search if we find another recipe generating a widen memory
+ // instruction. Widen memory instructions involved in address computation
+ // will lead to gather/scatter instructions, which don't need to be
+ // handled.
+ if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
+ isa<VPInterleaveRecipe>(CurRec))
+ continue;
+
+ // This recipe contributes to the address computation of a widen
+ // load/store. Collect recipe if its underlying instruction has
+ // poison-generating flags.
+ Instruction *Instr = CurRec->getUnderlyingInstr();
+ if (Instr && Instr->hasPoisonGeneratingFlags())
+ State.MayGeneratePoisonRecipes.insert(CurRec);
+
+ // Add new definitions to the worklist.
+ for (VPValue *operand : CurRec->operands())
+ if (VPDef *OpDef = operand->getDef())
+ Worklist.push_back(cast<VPRecipeBase>(OpDef));
+ }
+ });
+
+ // Traverse all the recipes in the VPlan and collect the poison-generating
+ // recipes in the backward slice starting at the address of a VPWidenRecipe or
+ // VPInterleaveRecipe.
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+ for (VPRecipeBase &Recipe : *VPBB) {
+ if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
+ Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
+ VPDef *AddrDef = WidenRec->getAddr()->getDef();
+ if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
+ Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
+ collectPoisonGeneratingInstrsInBackwardSlice(
+ cast<VPRecipeBase>(AddrDef));
+ } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
+ VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
+ if (AddrDef) {
+ // Check if any member of the interleave group needs predication.
+ const InterleaveGroup<Instruction> *InterGroup =
+ InterleaveRec->getInterleaveGroup();
+ bool NeedPredication = false;
+ for (int I = 0, NumMembers = InterGroup->getNumMembers();
+ I < NumMembers; ++I) {
+ Instruction *Member = InterGroup->getMember(I);
+ if (Member)
+ NeedPredication |=
+ Legal->blockNeedsPredication(Member->getParent());
+ }
+
+ if (NeedPredication)
+ collectPoisonGeneratingInstrsInBackwardSlice(
+ cast<VPRecipeBase>(AddrDef));
+ }
+ }
+ }
+ }
+}
+
void InnerLoopVectorizer::addMetadata(Instruction *To,
Instruction *From) {
propagateMetadata(To, From);
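
To make the conditions this function checks concrete, here is a minimal, purely illustrative C++ loop (not taken from the patch) of the shape the analysis guards against: the 'i + off' add carries nsw and the GEP carries inbounds, both justified in the scalar loop only because the access is guarded by the branch. After if-conversion the consecutive access becomes a masked wide load whose address is computed for every lane, so the flags on the address-computing recipes must be dropped.

// Illustrative sketch only: a guarded consecutive access whose address
// computation carries poison-generating flags (nsw on the add, inbounds on
// the GEP). Once the branch is linearized and the load is widened into a
// masked vector load, the address is formed unconditionally for all lanes,
// which is why collectPoisonGeneratingRecipes marks the address recipes.
int sumGuarded(const int *a, const bool *cond, int off, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i)
    if (cond[i])
      sum += a[i + off];
  return sum;
}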
@@ -1541,7 +1607,16 @@ public:
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) {
+ bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ // When we know the load is uniform and the original scalar loop was not
+ // predicated, we don't need to mark it as a predicated instruction. Any
+ // vectorized blocks created when tail-folding are artificial blocks that we
+ // have introduced ourselves, and we know there is always at least one active
+ // lane. That's why we call Legal->blockNeedsPredication here: it doesn't
+ // query tail-folding.
+ if (IsKnownUniform && isa<LoadInst>(I) &&
+ !Legal->blockNeedsPredication(I->getParent()))
+ return false;
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
// Loads and stores that need some form of masked operation are predicated
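
For illustration (a hypothetical loop, not from the patch): the uniform load below sits in a block that the original scalar loop never predicates. With tail-folding the whole vector body is masked, but that mask is artificial and at least one lane is always active, so the load does not need to be treated as a predicated instruction.

// Illustrative sketch only: '*scale' is a load from a loop-invariant address
// in an unpredicated scalar loop. Tail-folding masks the vector body, but the
// mask was introduced by the vectorizer itself, so this uniform load need not
// become a masked/predicated operation.
void scaleAll(int *a, const int *scale, int n) {
  for (int i = 0; i < n; ++i)
    a[i] *= *scale;
}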
@@ -1816,9 +1891,11 @@ private:
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
+ /// during vectorization. collectLoopScalars should only add non-uniform nodes
+ /// to the list if they are used by a load/store instruction that is marked as
+ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+ /// VF values in the vectorized loop, each corresponding to an iteration of
+ /// the original scalar loop.
void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
@@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::vectorizeMemoryInstruction(
- Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
- bool Reverse) {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
-
- Type *ScalarDataTy = getLoadStoreType(Instr);
-
- auto *DataTy = VectorType::get(ScalarDataTy, VF);
- const Align Alignment = getLoadStoreAlignment(Instr);
- bool CreateGatherScatter = !ConsecutiveStride;
-
- VectorParts BlockInMaskParts(UF);
- bool isMaskRequired = BlockInMask;
- if (isMaskRequired)
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockInMaskParts[Part] = State.get(BlockInMask, Part);
-
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
- if (Reverse) {
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- // RunTimeVF = VScale * VF.getKnownMinValue()
- // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
- // NumElt = -Part * RunTimeVF
- Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
- PartPtr =
- cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
- PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
- PartPtr->setIsInBounds(InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
- } else {
- Value *Increment =
- createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
- }
-
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- };
-
- // Handle Stores:
- if (SI) {
- setDebugLocFromInst(SI);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (Reverse) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = reverseVector(StoredVal);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- addMetadata(NewSI, SI);
- }
- return;
- }
-
- // Handle loads.
- assert(LI && "Must have a load instruction");
- setDebugLocFromInst(LI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
- DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
- PoisonValue::get(DataTy), "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = reverseVector(NewLI);
- }
-
- State.set(Def, NewLI, Part);
- }
-}
-
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
- VPUser &User,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
bool IfPredicateInstr,
VPTransformState &State) {
@@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
+ // If the scalarized instruction contributes to the address computation of a
+ // widen masked load/store which was in a basic block that needed predication
+ // and is not predicated after vectorization, we can't propagate
+ // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
+ // instruction could feed a poison value to the base address of the widen
+ // load/store.
+ if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
+ Cloned->dropPoisonGeneratingFlags();
+
State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
InputInstance.Lane = VPLane::getFirstLane();
- auto *NewOp = State.get(User.getOperand(op), InputInstance);
+ auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- State.set(Def, Cloned, Instance);
+ State.set(RepRecipe, Cloned, Instance);
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
- VPUser &Operands, unsigned UF,
- ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant,
- VPTransformState &State) {
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
-
- if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- State.set(VPDef, EntryPart, Part);
- addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
- ? State.get(Operands.getOperand(0), VPIteration(0, 0))
- : State.get(Operands.getOperand(0), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
- VPValue *Operand = Operands.getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, VPIteration(0, 0)));
- else
- Indices.push_back(State.get(Operand, Part));
- }
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP =
- GEP->isInBounds()
- ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
- Indices)
- : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
- assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
- State.set(VPDef, NewGEP, Part);
- addMetadata(NewGEP, GEP);
- }
- }
-}
-
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
VPWidenPHIRecipe *PhiR,
VPTransformState &State) {
@@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
- unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
-
- bool NeedsVectorIndex = !IsUniform && VF.isScalable();
- Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
- if (NeedsVectorIndex) {
- Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
- UnitStepVec = Builder.CreateStepVector(VecIVTy);
- PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
- }
+ assert((IsUniform || !State.VF.isScalable()) &&
+ "Cannot scalarize a scalable VF");
+ unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart =
createStepForVF(Builder, PtrInd->getType(), VF, Part);
- if (NeedsVectorIndex) {
- // Here we cache the whole vector, which means we can support the
- // extraction of any lane. However, in some cases the extractelement
- // instruction that is generated for scalar uses of this vector (e.g.
- // a load instruction) is not folded away. Therefore we still
- // calculate values for the first n lanes to avoid redundant moves
- // (when extracting the 0th element) and to produce scalar code (i.e.
- // additional add/gep instructions instead of expensive extractelement
- // instructions) when extracting higher-order elements.
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
- Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
- Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
- Value *SclrGep =
- emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
- SclrGep->setName("next.gep");
- State.set(PhiR, SclrGep, Part);
- }
-
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
- VPUser &User,
- VPTransformState &State) {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Br:
- case Instruction::PHI:
- case Instruction::GetElementPtr:
- case Instruction::Select:
- llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::FNeg:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen unops and binops.
- setDebugLocFromInst(&I);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 2> Ops;
- for (VPValue *VPOp : User.operands())
- Ops.push_back(State.get(VPOp, Part));
-
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
- if (auto *VecOp = dyn_cast<Instruction>(V))
- VecOp->copyIRFlags(&I);
-
- // Use this vector value for all users of the original instruction.
- State.set(Def, V, Part);
- addMetadata(V, &I);
- }
-
- break;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- setDebugLocFromInst(Cmp);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *B = State.get(User.getOperand(1), Part);
- Value *C = nullptr;
- if (FCmp) {
- // Propagate fast math flags.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- }
- State.set(Def, C, Part);
- addMetadata(C, &I);
- }
-
- break;
- }
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- setDebugLocFromInst(CI);
-
- /// Vectorize casts.
- Type *DestTy =
- (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- State.set(Def, Cast, Part);
- addMetadata(Cast, &I);
- }
- break;
- }
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
-}
-
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
@@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
}
}
-void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
- VPUser &Operands,
- bool InvariantCond,
- VPTransformState &State) {
- setDebugLocFromInst(&I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- auto *InvarCond = InvariantCond
- ? State.get(Operands.getOperand(0), VPIteration(0, 0))
- : nullptr;
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond =
- InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
- Value *Op0 = State.get(Operands.getOperand(1), Part);
- Value *Op1 = State.get(Operands.getOperand(2), Part);
- Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
- State.set(VPDef, Sel, Part);
- addMetadata(Sel, &I);
- }
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
!TheLoop->isLoopInvariant(V);
};
- auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
- if (!isa<PHINode>(Ptr) ||
- !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
- return false;
- auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
- if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
- return false;
- return isScalarUse(MemAccess, Ptr);
- };
-
- // A helper that evaluates a memory access's use of a pointer. If the
- // pointer is actually the pointer induction of a loop, it is being
- // inserted into Worklist. If the use will be a scalar use, and the
- // pointer is only used by memory accesses, we place the pointer in
- // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
+ // A helper that evaluates a memory access's use of a pointer. If the use will
+ // be a scalar use and the pointer is only used by memory accesses, we place
+ // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
- if (isScalarPtrInduction(MemAccess, Ptr)) {
- Worklist.insert(cast<Instruction>(Ptr));
- LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
- << "\n");
-
- Instruction *Update = cast<Instruction>(
- cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-
- // If there is more than one user of Update (Ptr), we shouldn't assume it
- // will be scalar after vectorisation as other users of the instruction
- // may require widening. Otherwise, add it to ScalarPtrs.
- if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
- ScalarPtrs.insert(Update);
- return;
- }
- }
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;
+ // Returns true if \p Indvar is a pointer induction that is used directly by
+ // load/store instruction \p I.
+ auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
+ Instruction *I) {
+ return Induction.second.getKind() ==
+ InductionDescriptor::IK_PtrInduction &&
+ (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
+ };
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(Ind, I);
});
if (!ScalarInd)
continue;
@@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto ScalarIndUpdate =
llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
});
if (!ScalarIndUpdate)
continue;
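
A hypothetical loop illustrating the IsDirectLoadStoreFromPtrIndvar check added above: the pointer induction is used directly as the pointer operand of the access, and with this change such a use (assuming the access itself is scalarized or uniform) no longer prevents the induction and its update from being classified as scalar after vectorization.

// Illustrative sketch only: 'p' is a pointer induction whose in-loop users
// are its own increment and a store that uses it directly as the pointer
// operand. That direct load/store use now counts as a scalar use, so the
// induction and its update can remain scalar after vectorization.
void fill(int *p, int *end, int v) {
  for (; p != end; ++p)
    *p = v;
}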
@@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
+ // that it is being called from this specific place.
// Figure out whether the access is strided and get the stride value
// if it's known in compile time
@@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost BaseCost = TTI.getArithmeticReductionCost(
RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+ // normal fmul instruction to the cost of the fadd reduction.
+ if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+ BaseCost +=
+ TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+
// If we're using ordered reductions then we can just return the base cost
// here, since getArithmeticReductionCost calculates the full ordered
// reduction cost when FP reassociation is not allowed.
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ return *RedCost;
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
@@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
+ ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
DEBUG_WITH_TYPE(VerboseDebug, {
- dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+ dbgs() << "intermediate fn:\n"
+ << *OrigLoop->getHeader()->getParent() << "\n";
});
}
@@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
DEBUG_WITH_TYPE(VerboseDebug, {
- dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+ dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
});
}
@@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ Range);
// Even if the instruction is not marked as uniform, there are certain
// intrinsic calls that can be effectively treated as such, so we check for
@@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (VPBB)
VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
else {
- Plan->setEntry(FirstVPBBForBB);
+ auto *TopRegion = new VPRegionBlock("vector loop");
+ TopRegion->setEntry(FirstVPBBForBB);
+ Plan->setEntry(TopRegion);
HeaderVPBB = FirstVPBBForBB;
}
VPBB = FirstVPBBForBB;
@@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- assert(isa<VPBasicBlock>(Plan->getEntry()) &&
+ assert(isa<VPRegionBlock>(Plan->getEntry()) &&
!Plan->getEntry()->getEntryBasicBlock()->empty() &&
- "entry block must be set to a non-empty VPBasicBlock");
+ "entry block must be set to a VPRegionBlock having a non-empty entry "
+ "VPBasicBlock");
+ cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
RecipeBuilder.fixHeaderPhis();
// ---------------------------------------------------------------------------
@@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned FirstOpId;
assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
"Only min/max recurrences allowed for inloop reductions");
+ // Recognize a call to the llvm.fmuladd intrinsic.
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
+ assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
+ "Expected instruction to be a call to the llvm.fmuladd intrinsic");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
"Expected to replace a VPWidenSelectSC");
FirstOpId = 1;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
+ (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
"Expected to replace a VPWidenSC");
FirstOpId = 0;
}
@@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
auto *CondOp = CM.foldTailByMasking()
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
- VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+
+ if (IsFMulAdd) {
+ // If the instruction is a call to the llvm.fmuladd intrinsic then we
+ // need to create an fmul recipe to use as the vector operand for the
+ // fadd reduction.
+ VPInstruction *FMulRecipe = new VPInstruction(
+ Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
+ FMulRecipe->setFastMathFlags(R->getFastMathFlags());
+ WidenRecipe->getParent()->insert(FMulRecipe,
+ WidenRecipe->getIterator());
+ VecOp = FMulRecipe;
+ }
+ VPReductionRecipe *RedRecipe =
+ new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
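
For context, a hypothetical source loop producing the pattern handled here: under floating-point contraction the multiply-add is emitted as a call to the llvm.fmuladd intrinsic, and the code above splits that call into an explicit FMul recipe whose result becomes the vector operand of the fadd reduction (assuming the reduction is otherwise legal to vectorize, e.g. under fast-math or as an ordered reduction).

// Illustrative sketch only: with FP contraction enabled (e.g. Clang's default
// -ffp-contract=on), 'sum += a[i] * b[i]' is typically emitted as a call to
// the llvm.fmuladd intrinsic. The in-loop reduction is then vectorized as a
// vector fmul of a[i] and b[i] feeding an fadd reduction with the running sum.
float dot(const float *a, const float *b, int n) {
  float sum = 0.0f;
  for (int i = 0; i < n; ++i)
    sum += a[i] * b[i];
  return sum;
}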
@@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
}
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
- this, *this, InvariantCond, State);
+ auto &I = *cast<SelectInst>(getUnderlyingInstr());
+ State.ILV->setDebugLocFromInst(&I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
+ Value *Op0 = State.get(getOperand(1), Part);
+ Value *Op1 = State.get(getOperand(2), Part);
+ Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
+ State.set(this, Sel, Part);
+ State.ILV->addMetadata(Sel, &I);
+ }
}
void VPWidenRecipe::execute(VPTransformState &State) {
- State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
+ auto &I = *cast<Instruction>(getUnderlyingValue());
+ auto &Builder = State.Builder;
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Br:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ State.ILV->setDebugLocFromInst(&I);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (VPValue *VPOp : operands())
+ Ops.push_back(State.get(VPOp, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V)) {
+ VecOp->copyIRFlags(&I);
+
+ // If the instruction is vectorized and was in a basic block that needed
+ // predication, we can't propagate poison-generating flags (nuw/nsw,
+ // exact, etc.). The control flow has been linearized and the
+ // instruction is no longer guarded by the predicate, which could make
+ // the flag properties no longer hold.
+ if (State.MayGeneratePoisonRecipes.count(this) > 0)
+ VecOp->dropPoisonGeneratingFlags();
+ }
+
+ // Use this vector value for all users of the original instruction.
+ State.set(this, V, Part);
+ State.ILV->addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ State.ILV->setDebugLocFromInst(Cmp);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
+ State.set(this, C, Part);
+ State.ILV->addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ State.ILV->setDebugLocFromInst(CI);
+
+ /// Vectorize casts.
+ Type *DestTy = (State.VF.isScalar())
+ ? CI->getType()
+ : VectorType::get(CI->getType(), State.VF);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ State.set(this, Cast, Part);
+ State.ILV->addMetadata(Cast, &I);
+ }
+ break;
+ }
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
- *this, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant, State);
+ auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
+ if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = State.Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
+ State.set(this, EntryPart, Part);
+ State.ILV->addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant
+ ? State.get(getOperand(0), VPIteration(0, 0))
+ : State.get(getOperand(0), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+ VPValue *Operand = getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, VPIteration(0, 0)));
+ else
+ Indices.push_back(State.get(Operand, Part));
+ }
+
+ // If the GEP instruction is vectorized and was in a basic block that
+ // needed predication, we can't propagate the poison-generating 'inbounds'
+ // flag. The control flow has been linearized and the GEP is no longer
+ // guarded by the predicate, which could make the 'inbounds' property no
+ // longer hold.
+ bool IsInBounds =
+ GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = IsInBounds
+ ? State.Builder.CreateInBoundsGEP(
+ GEP->getSourceElementType(), Ptr, Indices)
+ : State.Builder.CreateGEP(GEP->getSourceElementType(),
+ Ptr, Indices);
+ assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ State.set(this, NewGEP, Part);
+ State.ILV->addMetadata(NewGEP, GEP);
+ }
+ }
}
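
A hypothetical loop hitting the all-operands-loop-invariant special case above (ignoring earlier passes such as LICM that may hoist the invariant GEP): the GEP itself is invariant, but its result feeds a widened store of pointers, so the cloned scalar GEP has to be broadcast into a vector of identical pointers.

// Illustrative sketch only: '&s->b' is a GEP whose pointer and index operands
// are all loop-invariant, yet its result is stored by a vectorized store of
// pointers, so the recipe clones the scalar GEP and splats the result across
// the vector lanes.
struct S { int a; int b; };
void recordAddr(S *s, int **out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = &s->b;
}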
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
@@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
- *State.Instance, IsPredicated, State);
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+ IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
@@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
"Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
VPIteration(Part, Lane), IsPredicated,
State);
}
@@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
- State.ILV->vectorizeMemoryInstruction(
- &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
- StoredValue, getMask(), Consecutive, Reverse);
+
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
+ StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
+
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ bool CreateGatherScatter = !Consecutive;
+
+ auto &Builder = State.Builder;
+ InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
+ bool isMaskRequired = getMask();
+ if (isMaskRequired)
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ BlockInMaskParts[Part] = State.get(getMask(), Part);
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+ // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+ Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+ PartPtr =
+ cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
+ PartPtr->setIsInBounds(InBounds);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ BlockInMaskParts[Part] =
+ Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
+ } else {
+ Value *Increment =
+ createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
+ // Handle Stores:
+ if (SI) {
+ State.ILV->setDebugLocFromInst(SI);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(getAddr(), Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
+ } else {
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+ }
+ auto *VecPtr =
+ CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ if (isMaskRequired)
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ }
+ State.ILV->addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ State.ILV->setDebugLocFromInst(LI);
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(getAddr(), Part);
+ NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ State.ILV->addMetadata(NewLI, LI);
+ } else {
+ auto *VecPtr =
+ CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ if (isMaskRequired)
+ NewLI = Builder.CreateMaskedLoad(
+ DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+ PoisonValue::get(DataTy), "wide.masked.load");
+ else
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ State.ILV->addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+ }
+
+ State.set(getVPSingleValue(), NewLI, Part);
+ }
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
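
The reversed-address arithmetic in the hunk above works out as follows: for a consecutive-but-reversed access, the wide load/store for unroll part Part starts NumElt + LastLane = -Part * RunTimeVF + (1 - RunTimeVF) elements from the scalar pointer, and the loaded or stored vector is then reversed to recover the scalar iteration order. A small standalone sketch, assuming a fixed-width VF (i.e. VScale == 1):

#include <cstdio>

// Illustrative sketch only: base element offset of the wide access for unroll
// part 'Part' of a reversed consecutive access with fixed-width VF.
static int reversedPartOffset(int Part, int VF) {
  int NumElt = -Part * VF; // skip the elements handled by earlier parts
  int LastLane = 1 - VF;   // step back so the access starts at the last lane
  return NumElt + LastLane;
}

int main() {
  // For VF = 4: part 0 covers elements [-3, 0], part 1 covers [-7, -4], ...
  for (int Part = 0; Part < 3; ++Part)
    std::printf("part %d starts at element %d\n", Part,
                reversedPartOffset(Part, 4));
  return 0;
}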