author    | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:02:54 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:02:54 +0000
commit    | f65dcba83ce5035ab88a85fe17628b447eb56e1b (patch)
tree      | 35f37bb72b3cfc6060193e66c76ee7c9478969b0 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    | 846a2208a8ab099f595fe7e8b2e6d54a7b5e67fb (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 980
1 file changed, 527 insertions, 453 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23bb6f0860c9..5ca0adb4242c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -473,18 +473,10 @@ public:
   /// handle the more complex control flow around the loops.
   virtual BasicBlock *createVectorizedLoopSkeleton();

-  /// Widen a single instruction within the innermost loop.
-  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
-                        VPTransformState &State);
-
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                             VPTransformState &State);

-  /// Widen a single select instruction within the innermost loop.
-  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
-                              bool InvariantCond, VPTransformState &State);
-
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State);

@@ -496,12 +488,6 @@ public:
   /// new unrolled loop, where UF is the unroll factor.
   using VectorParts = SmallVector<Value *, 2>;

-  /// Vectorize a single GetElementPtrInst based on information gathered and
-  /// decisions taken during planning.
-  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
-                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
-                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
-
   /// Vectorize a single first-order recurrence or pointer induction PHINode in
   /// a block. This method handles the induction variable canonicalization. It
   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
@@ -511,9 +497,9 @@ public:
   /// A helper function to scalarize a single Instruction in the innermost loop.
   /// Generates a sequence of scalar instances for each lane between \p MinLane
   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
-  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
   /// Instr's operands.
-  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);

@@ -538,15 +524,6 @@ public:
                                 ArrayRef<VPValue *> StoredValues,
                                 VPValue *BlockInMask = nullptr);

-  /// Vectorize Load and Store instructions with the base address given in \p
-  /// Addr, optionally masking the vector operations if \p BlockInMask is
-  /// non-null. Use \p State to translate given VPValues to IR values in the
-  /// vectorized loop.
-  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
-                                  VPValue *Def, VPValue *Addr,
-                                  VPValue *StoredValue, VPValue *BlockInMask,
-                                  bool ConsecutiveStride, bool Reverse);
-
   /// Set the debug location in the builder \p Ptr using the debug location in
   /// \p V. If \p Ptr is None then it uses the class member's Builder.
   void setDebugLocFromInst(const Value *V,
@@ -566,6 +543,17 @@ public:
   /// element.
   virtual Value *getBroadcastInstrs(Value *V);

+  /// Add metadata from one instruction to another.
+  ///
+  /// This includes both the original MDs from \p From and additional ones (\see
+  /// addNewMetadata). Use this for *newly created* instructions in the vector
+  /// loop.
+  void addMetadata(Instruction *To, Instruction *From);
+
+  /// Similar to the previous function but it adds the metadata to a
+  /// vector of instructions.
+  void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
 protected:
   friend class LoopVectorizationPlanner;

@@ -741,16 +729,16 @@ protected:
   /// vector loop.
   void addNewMetadata(Instruction *To, const Instruction *Orig);

-  /// Add metadata from one instruction to another.
-  ///
-  /// This includes both the original MDs from \p From and additional ones (\see
-  /// addNewMetadata). Use this for *newly created* instructions in the vector
-  /// loop.
-  void addMetadata(Instruction *To, Instruction *From);
-
-  /// Similar to the previous function but it adds the metadata to a
-  /// vector of instructions.
-  void addMetadata(ArrayRef<Value *> To, Instruction *From);
+  /// Collect poison-generating recipes that may generate a poison value that is
+  /// used after vectorization, even when their operands are not poison. Those
+  /// recipes meet the following conditions:
+  ///  * Contribute to the address computation of a recipe generating a widen
+  ///    memory load/store (VPWidenMemoryInstructionRecipe or
+  ///    VPInterleaveRecipe).
+  ///  * Such a widen memory load/store has at least one underlying Instruction
+  ///    that is in a basic block that needs predication and after vectorization
+  ///    the generated instruction won't be predicated.
+  void collectPoisonGeneratingRecipes(VPTransformState &State);

   /// Allow subclasses to override and print debug traces before/after vplan
   /// execution, when trace information is requested.
@@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To,
   LVer->annotateInstWithNoAlias(To, Orig);
 }

+void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
+    VPTransformState &State) {
+
+  // Collect recipes in the backward slice of `Root` that may generate a poison
+  // value that is used after vectorization.
+  SmallPtrSet<VPRecipeBase *, 16> Visited;
+  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
+    SmallVector<VPRecipeBase *, 16> Worklist;
+    Worklist.push_back(Root);
+
+    // Traverse the backward slice of Root through its use-def chain.
+    while (!Worklist.empty()) {
+      VPRecipeBase *CurRec = Worklist.back();
+      Worklist.pop_back();
+
+      if (!Visited.insert(CurRec).second)
+        continue;
+
+      // Prune search if we find another recipe generating a widen memory
+      // instruction. Widen memory instructions involved in address computation
+      // will lead to gather/scatter instructions, which don't need to be
+      // handled.
+      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
+          isa<VPInterleaveRecipe>(CurRec))
+        continue;
+
+      // This recipe contributes to the address computation of a widen
+      // load/store. Collect recipe if its underlying instruction has
+      // poison-generating flags.
+      Instruction *Instr = CurRec->getUnderlyingInstr();
+      if (Instr && Instr->hasPoisonGeneratingFlags())
+        State.MayGeneratePoisonRecipes.insert(CurRec);
+
+      // Add new definitions to the worklist.
+      for (VPValue *operand : CurRec->operands())
+        if (VPDef *OpDef = operand->getDef())
+          Worklist.push_back(cast<VPRecipeBase>(OpDef));
+    }
+  });
+
+  // Traverse all the recipes in the VPlan and collect the poison-generating
+  // recipes in the backward slice starting at the address of a VPWidenRecipe or
+  // VPInterleaveRecipe.
+  auto Iter = depth_first(
+      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (VPRecipeBase &Recipe : *VPBB) {
+      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
+        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
+        VPDef *AddrDef = WidenRec->getAddr()->getDef();
+        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
+            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
+          collectPoisonGeneratingInstrsInBackwardSlice(
+              cast<VPRecipeBase>(AddrDef));
+      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
+        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
+        if (AddrDef) {
+          // Check if any member of the interleave group needs predication.
+          const InterleaveGroup<Instruction> *InterGroup =
+              InterleaveRec->getInterleaveGroup();
+          bool NeedPredication = false;
+          for (int I = 0, NumMembers = InterGroup->getNumMembers();
+               I < NumMembers; ++I) {
+            Instruction *Member = InterGroup->getMember(I);
+            if (Member)
+              NeedPredication |=
+                  Legal->blockNeedsPredication(Member->getParent());
+          }
+
+          if (NeedPredication)
+            collectPoisonGeneratingInstrsInBackwardSlice(
+                cast<VPRecipeBase>(AddrDef));
+        }
+      }
+    }
+  }
+}
+
 void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
   propagateMetadata(To, From);
@@ -1541,7 +1607,16 @@ public:
   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I) {
+  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+    // When we know the load is uniform and the original scalar loop was not
+    // predicated we don't need to mark it as a predicated instruction. Any
+    // vectorised blocks created when tail-folding are something artificial we
+    // have introduced and we know there is always at least one active lane.
+    // That's why we call Legal->blockNeedsPredication here because it doesn't
+    // query tail-folding.
+    if (IsKnownUniform && isa<LoadInst>(I) &&
+        !Legal->blockNeedsPredication(I->getParent()))
+      return false;
     if (!blockNeedsPredicationForAnyReason(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
@@ -1816,9 +1891,11 @@ private:
   /// Collect the instructions that are scalar after vectorization. An
   /// instruction is scalar if it is known to be uniform or will be scalarized
-  /// during vectorization. Non-uniform scalarized instructions will be
-  /// represented by VF values in the vectorized loop, each corresponding to an
-  /// iteration of the original scalar loop.
+  /// during vectorization. collectLoopScalars should only add non-uniform nodes
+  /// to the list if they are used by a load/store instruction that is marked as
+  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+  /// VF values in the vectorized loop, each corresponding to an iteration of
+  /// the original scalar loop.
   void collectLoopScalars(ElementCount VF);

   /// Keeps cost model vectorization decision and cost for instructions.
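For context on the poison-flag handling introduced above, the guarded-access pattern it protects against looks roughly like the standalone C++ sketch below (illustrative only, not taken from this commit; the function name guardedCopy is made up). In the scalar loop the address of a[i] is only computed when cond[i] is true, but after tail folding or control-flow linearization the vectorizer computes it for every lane and relies on masking only the memory access itself, so poison-generating flags such as inbounds copied onto the address recipe have to be dropped.

    // Illustrative only: a loop whose address computation is guarded in the
    // scalar code. After linearization, the GEP for &a[i] runs for inactive
    // lanes too, so an 'inbounds' flag inherited from the scalar GEP could
    // turn those lanes into poison feeding the masked load's base address.
    void guardedCopy(const int *a, const bool *cond, int *out, int n) {
      for (int i = 0; i < n; ++i)
        if (cond[i])
          out[i] = a[i]; // only this access stays masked after vectorization
    }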
@@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
   }
 }

-void InnerLoopVectorizer::vectorizeMemoryInstruction(
-    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
-    VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
-    bool Reverse) {
-  // Attempt to issue a wide load.
-  LoadInst *LI = dyn_cast<LoadInst>(Instr);
-  StoreInst *SI = dyn_cast<StoreInst>(Instr);
-
-  assert((LI || SI) && "Invalid Load/Store instruction");
-  assert((!SI || StoredValue) && "No stored value provided for widened store");
-  assert((!LI || !StoredValue) && "Stored value provided for widened load");
-
-  Type *ScalarDataTy = getLoadStoreType(Instr);
-
-  auto *DataTy = VectorType::get(ScalarDataTy, VF);
-  const Align Alignment = getLoadStoreAlignment(Instr);
-  bool CreateGatherScatter = !ConsecutiveStride;
-
-  VectorParts BlockInMaskParts(UF);
-  bool isMaskRequired = BlockInMask;
-  if (isMaskRequired)
-    for (unsigned Part = 0; Part < UF; ++Part)
-      BlockInMaskParts[Part] = State.get(BlockInMask, Part);
-
-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    GetElementPtrInst *PartPtr = nullptr;
-
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (Reverse) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
-      PartPtr =
-          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
-      PartPtr->setIsInBounds(InBounds);
-      PartPtr = cast<GetElementPtrInst>(
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
-      PartPtr->setIsInBounds(InBounds);
-      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
-        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
-    } else {
-      Value *Increment =
-          createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
-      PartPtr = cast<GetElementPtrInst>(
-          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
-      PartPtr->setIsInBounds(InBounds);
-    }
-
-    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
-    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
-  };
-
-  // Handle Stores:
-  if (SI) {
-    setDebugLocFromInst(SI);
-
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Instruction *NewSI = nullptr;
-      Value *StoredVal = State.get(StoredValue, Part);
-      if (CreateGatherScatter) {
-        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
-        Value *VectorGep = State.get(Addr, Part);
-        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
-                                            MaskPart);
-      } else {
-        if (Reverse) {
-          // If we store to reverse consecutive memory locations, then we need
-          // to reverse the order of elements in the stored value.
-          StoredVal = reverseVector(StoredVal);
-          // We don't want to update the value in the map as it might be used in
-          // another expression. So don't call resetVectorValue(StoredVal).
-        }
-        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
-        if (isMaskRequired)
-          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
-                                            BlockInMaskParts[Part]);
-        else
-          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
-      }
-      addMetadata(NewSI, SI);
-    }
-    return;
-  }
-
-  // Handle loads.
-  assert(LI && "Must have a load instruction");
-  setDebugLocFromInst(LI);
-  for (unsigned Part = 0; Part < UF; ++Part) {
-    Value *NewLI;
-    if (CreateGatherScatter) {
-      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
-      Value *VectorGep = State.get(Addr, Part);
-      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
-                                         nullptr, "wide.masked.gather");
-      addMetadata(NewLI, LI);
-    } else {
-      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(
-            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
-            PoisonValue::get(DataTy), "wide.masked.load");
-      else
-        NewLI =
-            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
-      // Add metadata to the load, but setVectorValue to the reverse shuffle.
-      addMetadata(NewLI, LI);
-      if (Reverse)
-        NewLI = reverseVector(NewLI);
-    }
-
-    State.set(Def, NewLI, Part);
-  }
-}
-
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
-                                               VPUser &User,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                               VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
                                                bool IfPredicateInstr,
                                                VPTransformState &State) {
@@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
   if (!IsVoidRetTy)
     Cloned->setName(Instr->getName() + ".cloned");

+  // If the scalarized instruction contributes to the address computation of a
+  // widen masked load/store which was in a basic block that needed predication
+  // and is not predicated after vectorization, we can't propagate
+  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
+  // instruction could feed a poison value to the base address of the widen
+  // load/store.
+  if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
+    Cloned->dropPoisonGeneratingFlags();
+
   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                                Builder.GetInsertPoint());
   // Replace the operands of the cloned instructions with their scalar
   // equivalents in the new loop.
-  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+  for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
         (Cost->isUniformAfterVectorization(Operand, State.VF)))
       InputInstance.Lane = VPLane::getFirstLane();
-    auto *NewOp = State.get(User.getOperand(op), InputInstance);
+    auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
   addNewMetadata(Cloned, Instr);
@@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
   // Place the cloned scalar in the new loop.
   Builder.Insert(Cloned);

-  State.set(Def, Cloned, Instance);
+  State.set(RepRecipe, Cloned, Instance);

   // If we just cloned a new assumption, add it the assumption cache.
   if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
   return Cost->useOrderedReductions(RdxDesc);
 }

-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
-                                   VPUser &Operands, unsigned UF,
-                                   ElementCount VF, bool IsPtrLoopInvariant,
-                                   SmallBitVector &IsIndexLoopInvariant,
-                                   VPTransformState &State) {
-  // Construct a vector GEP by widening the operands of the scalar GEP as
-  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-  // results in a vector of pointers when at least one operand of the GEP
-  // is vector-typed. Thus, to keep the representation compact, we only use
-  // vector-typed operands for loop-varying values.
-
-  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
-    // If we are vectorizing, but the GEP has only loop-invariant operands,
-    // the GEP we build (by only using vector-typed operands for
-    // loop-varying values) would be a scalar pointer. Thus, to ensure we
-    // produce a vector of pointers, we need to either arbitrarily pick an
-    // operand to broadcast, or broadcast a clone of the original GEP.
-    // Here, we broadcast a clone of the original.
-    //
-    // TODO: If at some point we decide to scalarize instructions having
-    //       loop-invariant operands, this special case will no longer be
-    //       required. We would add the scalarization decision to
-    //       collectLoopScalars() and teach getVectorValue() to broadcast
-    //       the lane-zero scalar value.
-    auto *Clone = Builder.Insert(GEP->clone());
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
-      State.set(VPDef, EntryPart, Part);
-      addMetadata(EntryPart, GEP);
-    }
-  } else {
-    // If the GEP has at least one loop-varying operand, we are sure to
-    // produce a vector of pointers. But if we are only unrolling, we want
-    // to produce a scalar GEP for each unroll part. Thus, the GEP we
-    // produce with the code below will be scalar (if VF == 1) or vector
-    // (otherwise). Note that for the unroll-only case, we still maintain
-    // values in the vector mapping with initVector, as we do for other
-    // instructions.
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      // The pointer operand of the new GEP. If it's loop-invariant, we
-      // won't broadcast it.
-      auto *Ptr = IsPtrLoopInvariant
-                      ? State.get(Operands.getOperand(0), VPIteration(0, 0))
-                      : State.get(Operands.getOperand(0), Part);
-
-      // Collect all the indices for the new GEP. If any index is
-      // loop-invariant, we won't broadcast it.
-      SmallVector<Value *, 4> Indices;
-      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
-        VPValue *Operand = Operands.getOperand(I);
-        if (IsIndexLoopInvariant[I - 1])
-          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
-        else
-          Indices.push_back(State.get(Operand, Part));
-      }
-
-      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-      // but it should be a vector, otherwise.
-      auto *NewGEP =
-          GEP->isInBounds()
-              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
-                                          Indices)
-              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
-      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
-             "NewGEP is not a pointer vector");
-      State.set(VPDef, NewGEP, Part);
-      addMetadata(NewGEP, GEP);
-    }
-  }
-}
-
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                               VPWidenPHIRecipe *PhiR,
                                               VPTransformState &State) {
@@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
     // iteration. If the instruction is uniform, we only need to generate the
     // first lane. Otherwise, we generate all VF values.
     bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
-    unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
-
-    bool NeedsVectorIndex = !IsUniform && VF.isScalable();
-    Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
-    if (NeedsVectorIndex) {
-      Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
-      UnitStepVec = Builder.CreateStepVector(VecIVTy);
-      PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
-    }
+    assert((IsUniform || !State.VF.isScalable()) &&
+           "Cannot scalarize a scalable VF");
+    unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part);

-      if (NeedsVectorIndex) {
-        // Here we cache the whole vector, which means we can support the
-        // extraction of any lane. However, in some cases the extractelement
-        // instruction that is generated for scalar uses of this vector (e.g.
-        // a load instruction) is not folded away. Therefore we still
-        // calculate values for the first n lanes to avoid redundant moves
-        // (when extracting the 0th element) and to produce scalar code (i.e.
-        // additional add/gep instructions instead of expensive extractelement
-        // instructions) when extracting higher-order elements.
-        Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
-        Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
-        Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
-        Value *SclrGep =
-            emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
-        SclrGep->setName("next.gep");
-        State.set(PhiR, SclrGep, Part);
-      }
-
       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
         Value *Idx = Builder.CreateAdd(
             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) {
   return !CInt || CInt->isZero();
 }

-void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
-                                           VPUser &User,
-                                           VPTransformState &State) {
-  switch (I.getOpcode()) {
-  case Instruction::Call:
-  case Instruction::Br:
-  case Instruction::PHI:
-  case Instruction::GetElementPtr:
-  case Instruction::Select:
-    llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::SRem:
-  case Instruction::URem:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::FNeg:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::FDiv:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Just widen unops and binops.
-    setDebugLocFromInst(&I);
-
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      SmallVector<Value *, 2> Ops;
-      for (VPValue *VPOp : User.operands())
-        Ops.push_back(State.get(VPOp, Part));
-
-      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
-      if (auto *VecOp = dyn_cast<Instruction>(V))
-        VecOp->copyIRFlags(&I);
-
-      // Use this vector value for all users of the original instruction.
-      State.set(Def, V, Part);
-      addMetadata(V, &I);
-    }
-
-    break;
-  }
-  case Instruction::ICmp:
-  case Instruction::FCmp: {
-    // Widen compares. Generate vector compares.
-    bool FCmp = (I.getOpcode() == Instruction::FCmp);
-    auto *Cmp = cast<CmpInst>(&I);
-    setDebugLocFromInst(Cmp);
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *A = State.get(User.getOperand(0), Part);
-      Value *B = State.get(User.getOperand(1), Part);
-      Value *C = nullptr;
-      if (FCmp) {
-        // Propagate fast math flags.
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-        Builder.setFastMathFlags(Cmp->getFastMathFlags());
-        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-      } else {
-        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
-      }
-      State.set(Def, C, Part);
-      addMetadata(C, &I);
-    }
-
-    break;
-  }
-
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    auto *CI = cast<CastInst>(&I);
-    setDebugLocFromInst(CI);
-
-    /// Vectorize casts.
-    Type *DestTy =
-        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *A = State.get(User.getOperand(0), Part);
-      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
-      State.set(Def, Cast, Part);
-      addMetadata(Cast, &I);
-    }
-    break;
-  }
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
-}
-
 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
                                                VPUser &ArgOperands,
                                                VPTransformState &State) {
@@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
   }
 }

-void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
-                                                 VPUser &Operands,
-                                                 bool InvariantCond,
-                                                 VPTransformState &State) {
-  setDebugLocFromInst(&I);
-
-  // The condition can be loop invariant but still defined inside the
-  // loop. This means that we can't just use the original 'cond' value.
-  // We have to take the 'vectorized' value and pick the first lane.
-  // Instcombine will make this a no-op.
-  auto *InvarCond = InvariantCond
-                        ? State.get(Operands.getOperand(0), VPIteration(0, 0))
-                        : nullptr;
-
-  for (unsigned Part = 0; Part < UF; ++Part) {
-    Value *Cond =
-        InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
-    Value *Op0 = State.get(Operands.getOperand(1), Part);
-    Value *Op1 = State.get(Operands.getOperand(2), Part);
-    Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
-    State.set(VPDef, Sel, Part);
-    addMetadata(Sel, &I);
-  }
-}
-
 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   // We should not collect Scalars more than once per VF. Right now, this
   // function is called from collectUniformsAndScalars(), which already does
@@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
                                !TheLoop->isLoopInvariant(V);
   };

-  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
-    if (!isa<PHINode>(Ptr) ||
-        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
-      return false;
-    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
-    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
-      return false;
-    return isScalarUse(MemAccess, Ptr);
-  };
-
-  // A helper that evaluates a memory access's use of a pointer. If the
-  // pointer is actually the pointer induction of a loop, it is being
-  // inserted into Worklist. If the use will be a scalar use, and the
-  // pointer is only used by memory accesses, we place the pointer in
-  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
+  // A helper that evaluates a memory access's use of a pointer. If the use will
+  // be a scalar use and the pointer is only used by memory accesses, we place
+  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+  // PossibleNonScalarPtrs.
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
-    if (isScalarPtrInduction(MemAccess, Ptr)) {
-      Worklist.insert(cast<Instruction>(Ptr));
-      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
-                        << "\n");
-
-      Instruction *Update = cast<Instruction>(
-          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-
-      // If there is more than one user of Update (Ptr), we shouldn't assume it
-      // will be scalar after vectorisation as other users of the instruction
-      // may require widening. Otherwise, add it to ScalarPtrs.
-      if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
-        ScalarPtrs.insert(Update);
-        return;
-      }
-    }
     // We only care about bitcast and getelementptr instructions contained in
     // the loop.
     if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
       continue;

+    // Returns true if \p Indvar is a pointer induction that is used directly by
+    // load/store instruction \p I.
+    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
+                                              Instruction *I) {
+      return Induction.second.getKind() ==
+                 InductionDescriptor::IK_PtrInduction &&
+             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
+    };
+
     // Determine if all users of the induction variable are scalar after
     // vectorization.
     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+             IsDirectLoadStoreFromPtrIndvar(Ind, I);
     });
     if (!ScalarInd)
       continue;
@@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
     auto ScalarIndUpdate =
         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
-         return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+         return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+                IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
     if (!ScalarIndUpdate)
       continue;
@@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   unsigned AS = getLoadStoreAddressSpace(I);
   Value *Ptr = getLoadStorePointerOperand(I);
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
+  //       that it is being called from this specific place.

   // Figure out whether the access is strided and get the stride value
   // if it's known in compile time
@@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

+  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+  // normal fmul instruction to the cost of the fadd reduction.
+  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+    BaseCost +=
+        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+
   // If we're using ordered reductions then we can just return the base cost
   // here, since getArithmeticReductionCost calculates the full ordered
   // reduction cost when FP reassociation is not allowed.
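The llvm.fmuladd costing added above corresponds to reductions like the sketch below (illustrative only, not taken from this commit): with floating-point contraction enabled (for example clang's -ffp-contract=on or fast), the multiply-add in the loop is emitted as a call to the llvm.fmuladd intrinsic, and the vectorizer now prices it as a widened fmul plus the fadd reduction.

    // Illustrative only: a dot-product reduction whose multiply-add is
    // contracted to llvm.fmuladd(a[i], b[i], sum) by the frontend. The cost
    // model above charges the fmul separately on top of the fadd reduction.
    float dot(const float *a, const float *b, int n) {
      float sum = 0.0f;
      for (int i = 0; i < n; ++i)
        sum += a[i] * b[i];
      return sum;
    }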
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
+      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+        return *RedCost;
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
@@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
   State.TripCount = ILV.getOrCreateTripCount(nullptr);
   State.CanonicalIV = ILV.Induction;
+  ILV.collectPoisonGeneratingRecipes(State);

   ILV.printDebugTracesAtStart();

@@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
   DEBUG_WITH_TYPE(VerboseDebug, {
-    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
+    dbgs() << "intermediate fn:\n"
+           << *OrigLoop->getHeader()->getParent() << "\n";
   });
 }

@@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
   DEBUG_WITH_TYPE(VerboseDebug, {
-    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
+    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
   });
 }

@@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
       Range);

   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+      Range);

   // Even if the instruction is not marked as uniform, there are certain
   // intrinsic calls that can be effectively treated as such, so we check for
@@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
       if (VPBB)
         VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
       else {
-        Plan->setEntry(FirstVPBBForBB);
+        auto *TopRegion = new VPRegionBlock("vector loop");
+        TopRegion->setEntry(FirstVPBBForBB);
+        Plan->setEntry(TopRegion);
         HeaderVPBB = FirstVPBBForBB;
       }
       VPBB = FirstVPBBForBB;
@@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     }
   }

-  assert(isa<VPBasicBlock>(Plan->getEntry()) &&
+  assert(isa<VPRegionBlock>(Plan->getEntry()) &&
         !Plan->getEntry()->getEntryBasicBlock()->empty() &&
-        "entry block must be set to a non-empty VPBasicBlock");
+        "entry block must be set to a VPRegionBlock having a non-empty entry "
+        "VPBasicBlock");
+  cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
   RecipeBuilder.fixHeaderPhis();

   // ---------------------------------------------------------------------------
@@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       unsigned FirstOpId;
       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
+      // Recognize a call to the llvm.fmuladd intrinsic.
+      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
+      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
+             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
                "Expected to replace a VPWidenSelectSC");
         FirstOpId = 1;
       } else {
-        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
+        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
+                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
                "Expected to replace a VPWidenSC");
         FirstOpId = 0;
       }
@@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       auto *CondOp = CM.foldTailByMasking()
                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                          : nullptr;
-      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
-          &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+
+      if (IsFMulAdd) {
+        // If the instruction is a call to the llvm.fmuladd intrinsic then we
+        // need to create an fmul recipe to use as the vector operand for the
+        // fadd reduction.
+        VPInstruction *FMulRecipe = new VPInstruction(
+            Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
+        FMulRecipe->setFastMathFlags(R->getFastMathFlags());
+        WidenRecipe->getParent()->insert(FMulRecipe,
+                                         WidenRecipe->getIterator());
+        VecOp = FMulRecipe;
+      }
+      VPReductionRecipe *RedRecipe =
+          new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       Plan->removeVPValueFor(R);
       Plan->addVPValue(R, RedRecipe);
@@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
 }

 void VPWidenSelectRecipe::execute(VPTransformState &State) {
-  State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
-                                    this, *this, InvariantCond, State);
+  auto &I = *cast<SelectInst>(getUnderlyingInstr());
+  State.ILV->setDebugLocFromInst(&I);
+
+  // The condition can be loop invariant but still defined inside the
+  // loop. This means that we can't just use the original 'cond' value.
+  // We have to take the 'vectorized' value and pick the first lane.
+  // Instcombine will make this a no-op.
+  auto *InvarCond =
+      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
+
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
+    Value *Op0 = State.get(getOperand(1), Part);
+    Value *Op1 = State.get(getOperand(2), Part);
+    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
+    State.set(this, Sel, Part);
+    State.ILV->addMetadata(Sel, &I);
+  }
 }

 void VPWidenRecipe::execute(VPTransformState &State) {
-  State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
+  auto &I = *cast<Instruction>(getUnderlyingValue());
+  auto &Builder = State.Builder;
+  switch (I.getOpcode()) {
+  case Instruction::Call:
+  case Instruction::Br:
+  case Instruction::PHI:
+  case Instruction::GetElementPtr:
+  case Instruction::Select:
+    llvm_unreachable("This instruction is handled by a different recipe.");
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FNeg:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Just widen unops and binops.
+    State.ILV->setDebugLocFromInst(&I);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      SmallVector<Value *, 2> Ops;
+      for (VPValue *VPOp : operands())
+        Ops.push_back(State.get(VPOp, Part));
+
+      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+      if (auto *VecOp = dyn_cast<Instruction>(V)) {
+        VecOp->copyIRFlags(&I);
+
+        // If the instruction is vectorized and was in a basic block that needed
+        // predication, we can't propagate poison-generating flags (nuw/nsw,
+        // exact, etc.). The control flow has been linearized and the
+        // instruction is no longer guarded by the predicate, which could make
+        // the flag properties to no longer hold.
+        if (State.MayGeneratePoisonRecipes.count(this) > 0)
+          VecOp->dropPoisonGeneratingFlags();
+      }
+
+      // Use this vector value for all users of the original instruction.
+      State.set(this, V, Part);
+      State.ILV->addMetadata(V, &I);
+    }
+
+    break;
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // Widen compares. Generate vector compares.
+    bool FCmp = (I.getOpcode() == Instruction::FCmp);
+    auto *Cmp = cast<CmpInst>(&I);
+    State.ILV->setDebugLocFromInst(Cmp);
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value *A = State.get(getOperand(0), Part);
+      Value *B = State.get(getOperand(1), Part);
+      Value *C = nullptr;
+      if (FCmp) {
+        // Propagate fast math flags.
+        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+        Builder.setFastMathFlags(Cmp->getFastMathFlags());
+        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+      } else {
+        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+      }
+      State.set(this, C, Part);
+      State.ILV->addMetadata(C, &I);
+    }
+
+    break;
+  }
+
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    auto *CI = cast<CastInst>(&I);
+    State.ILV->setDebugLocFromInst(CI);
+
+    /// Vectorize casts.
+    Type *DestTy = (State.VF.isScalar())
+                       ? CI->getType()
+                       : VectorType::get(CI->getType(), State.VF);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value *A = State.get(getOperand(0), Part);
+      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      State.set(this, Cast, Part);
+      State.ILV->addMetadata(Cast, &I);
+    }
+    break;
+  }
+  default:
+    // This instruction is not vectorized by simple widening.
+    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    llvm_unreachable("Unhandled instruction!");
+  } // end of switch.
 }

 void VPWidenGEPRecipe::execute(VPTransformState &State) {
-  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
-                      *this, State.UF, State.VF, IsPtrLoopInvariant,
-                      IsIndexLoopInvariant, State);
+  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+  // Construct a vector GEP by widening the operands of the scalar GEP as
+  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+  // results in a vector of pointers when at least one operand of the GEP
+  // is vector-typed. Thus, to keep the representation compact, we only use
+  // vector-typed operands for loop-varying values.
+
+  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+    // If we are vectorizing, but the GEP has only loop-invariant operands,
+    // the GEP we build (by only using vector-typed operands for
+    // loop-varying values) would be a scalar pointer. Thus, to ensure we
+    // produce a vector of pointers, we need to either arbitrarily pick an
+    // operand to broadcast, or broadcast a clone of the original GEP.
+    // Here, we broadcast a clone of the original.
+    //
+    // TODO: If at some point we decide to scalarize instructions having
+    //       loop-invariant operands, this special case will no longer be
+    //       required. We would add the scalarization decision to
+    //       collectLoopScalars() and teach getVectorValue() to broadcast
+    //       the lane-zero scalar value.
+    auto *Clone = State.Builder.Insert(GEP->clone());
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
+      State.set(this, EntryPart, Part);
+      State.ILV->addMetadata(EntryPart, GEP);
+    }
+  } else {
+    // If the GEP has at least one loop-varying operand, we are sure to
+    // produce a vector of pointers. But if we are only unrolling, we want
+    // to produce a scalar GEP for each unroll part. Thus, the GEP we
+    // produce with the code below will be scalar (if VF == 1) or vector
+    // (otherwise). Note that for the unroll-only case, we still maintain
+    // values in the vector mapping with initVector, as we do for other
+    // instructions.
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      // The pointer operand of the new GEP. If it's loop-invariant, we
+      // won't broadcast it.
+      auto *Ptr = IsPtrLoopInvariant
+                      ? State.get(getOperand(0), VPIteration(0, 0))
+                      : State.get(getOperand(0), Part);
+
+      // Collect all the indices for the new GEP. If any index is
+      // loop-invariant, we won't broadcast it.
+      SmallVector<Value *, 4> Indices;
+      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+        VPValue *Operand = getOperand(I);
+        if (IsIndexLoopInvariant[I - 1])
+          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
+        else
+          Indices.push_back(State.get(Operand, Part));
+      }
+
+      // If the GEP instruction is vectorized and was in a basic block that
+      // needed predication, we can't propagate the poison-generating 'inbounds'
+      // flag. The control flow has been linearized and the GEP is no longer
+      // guarded by the predicate, which could make the 'inbounds' properties to
+      // no longer hold.
+      bool IsInBounds =
+          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
+
+      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+      // but it should be a vector, otherwise.
+      auto *NewGEP = IsInBounds
+                         ? State.Builder.CreateInBoundsGEP(
+                               GEP->getSourceElementType(), Ptr, Indices)
+                         : State.Builder.CreateGEP(GEP->getSourceElementType(),
+                                                   Ptr, Indices);
+      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+             "NewGEP is not a pointer vector");
+      State.set(this, NewGEP, Part);
+      State.ILV->addMetadata(NewGEP, GEP);
+    }
+  }
 }

 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
@@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                    *State.Instance, IsPredicated, State);
+    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+                                    IsPredicated, State);
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
@@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
         "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                       VPIteration(Part, Lane), IsPredicated,
                                       State);
 }
@@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
-  State.ILV->vectorizeMemoryInstruction(
-      &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
-      StoredValue, getMask(), Consecutive, Reverse);
+
+  // Attempt to issue a wide load.
+  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
+  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+
+  assert((LI || SI) && "Invalid Load/Store instruction");
+  assert((!SI || StoredValue) && "No stored value provided for widened store");
+  assert((!LI || !StoredValue) && "Stored value provided for widened load");
+
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  bool CreateGatherScatter = !Consecutive;
+
+  auto &Builder = State.Builder;
+  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
+  bool isMaskRequired = getMask();
+  if (isMaskRequired)
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      BlockInMaskParts[Part] = State.get(getMask(), Part);
+
+  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+    // Calculate the pointer for the specific unroll-part.
+    GetElementPtrInst *PartPtr = nullptr;
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    if (Reverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+      PartPtr =
+          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
+      PartPtr->setIsInBounds(InBounds);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
+      PartPtr->setIsInBounds(InBounds);
+      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+        BlockInMaskParts[Part] =
+            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
+    } else {
+      Value *Increment =
+          createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
+      PartPtr->setIsInBounds(InBounds);
+    }
+
+    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+  };
+
+  // Handle Stores:
+  if (SI) {
+    State.ILV->setDebugLocFromInst(SI);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Instruction *NewSI = nullptr;
+      Value *StoredVal = State.get(StoredValue, Part);
+      if (CreateGatherScatter) {
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(getAddr(), Part);
+        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+                                            MaskPart);
+      } else {
+        if (Reverse) {
+          // If we store to reverse consecutive memory locations, then we need
+          // to reverse the order of elements in the stored value.
+          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
+          // We don't want to update the value in the map as it might be used in
+          // another expression. So don't call resetVectorValue(StoredVal).
+        }
+        auto *VecPtr =
+            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        if (isMaskRequired)
+          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+                                            BlockInMaskParts[Part]);
+        else
+          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+      }
+      State.ILV->addMetadata(NewSI, SI);
+    }
+    return;
+  }
+
+  // Handle loads.
+  assert(LI && "Must have a load instruction");
+  State.ILV->setDebugLocFromInst(LI);
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *NewLI;
+    if (CreateGatherScatter) {
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      Value *VectorGep = State.get(getAddr(), Part);
+      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
+                                         nullptr, "wide.masked.gather");
+      State.ILV->addMetadata(NewLI, LI);
+    } else {
+      auto *VecPtr =
+          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      if (isMaskRequired)
+        NewLI = Builder.CreateMaskedLoad(
+            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+            PoisonValue::get(DataTy), "wide.masked.load");
+      else
+        NewLI =
+            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

+      // Add metadata to the load, but setVectorValue to the reverse shuffle.
+      State.ILV->addMetadata(NewLI, LI);
+      if (Reverse)
+        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+    }
+
+    State.set(getVPSingleValue(), NewLI, Part);
+  }
 }

 // Determine how to lower the scalar epilogue, which depends on 1) optimising
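As a closing note on the reverse-access pointer arithmetic above (RunTimeVF, NumElt, LastLane), the offsets it produces can be checked with a small standalone sketch; the constants below are illustrative, not taken from this commit.

    #include <cstdio>

    // Illustrative only: for a reversed, consecutive access, unroll part
    // `Part` starts its wide load/store at element -Part * RunTimeVF +
    // (1 - RunTimeVF) relative to the scalar pointer, so each part covers
    // the RunTimeVF elements just below the previous one.
    int main() {
      const int RunTimeVF = 4; // VScale * VF.getKnownMinValue(); here fixed VF=4
      for (int Part = 0; Part < 2; ++Part) {
        int NumElt = -Part * RunTimeVF;
        int LastLane = 1 - RunTimeVF;
        std::printf("part %d starts at offset %d\n", Part, NumElt + LastLane);
      }
      return 0;
    }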