| Field | Value | Details |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-02 21:49:08 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-06-04 11:59:04 +0000 |
| commit | 574b7079b96703a748f89ef5adb7dc3e26b8f7fc | (patch) |
| tree | 195000196b1e0cc13dea43258fa240e006f48184 | /contrib/llvm-project/llvm/lib/Transforms/Vectorize |
| parent | 1f6fd64fe9c996b4795ee4a6c66b8f9216747560 | (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
4 files changed, 1027 insertions, 608 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23bb6f0860c9..5ca0adb4242c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -473,18 +473,10 @@ public: /// handle the more complex control flow around the loops. virtual BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, - VPTransformState &State); - /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); - /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, - bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); @@ -496,12 +488,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single GetElementPtrInst based on information gathered and - /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single first-order recurrence or pointer induction PHINode in /// a block. This method handles the induction variable canonicalization. It /// supports both VF = 1 for unrolled loops and arbitrary length vectors. @@ -511,9 +497,9 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. - void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -538,15 +524,6 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Vectorize Load and Store instructions with the base address given in \p - /// Addr, optionally masking the vector operations if \p BlockInMask is - /// non-null. Use \p State to translate given VPValues to IR values in the - /// vectorized loop. - void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, - bool ConsecutiveStride, bool Reverse); - /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. void setDebugLocFromInst(const Value *V, @@ -566,6 +543,17 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. 
+ void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + protected: friend class LoopVectorizationPlanner; @@ -741,16 +729,16 @@ protected: /// vector loop. void addNewMetadata(Instruction *To, const Instruction *Orig); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Collect poison-generating recipes that may generate a poison value that is + /// used after vectorization, even when their operands are not poison. Those + /// recipes meet the following conditions: + /// * Contribute to the address computation of a recipe generating a widen + /// memory load/store (VPWidenMemoryInstructionRecipe or + /// VPInterleaveRecipe). + /// * Such a widen memory load/store has at least one underlying Instruction + /// that is in a basic block that needs predication and after vectorization + /// the generated instruction won't be predicated. + void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. @@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To, LVer->annotateInstWithNoAlias(To, Orig); } +void InnerLoopVectorizer::collectPoisonGeneratingRecipes( + VPTransformState &State) { + + // Collect recipes in the backward slice of `Root` that may generate a poison + // value that is used after vectorization. + SmallPtrSet<VPRecipeBase *, 16> Visited; + auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + SmallVector<VPRecipeBase *, 16> Worklist; + Worklist.push_back(Root); + + // Traverse the backward slice of Root through its use-def chain. + while (!Worklist.empty()) { + VPRecipeBase *CurRec = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(CurRec).second) + continue; + + // Prune search if we find another recipe generating a widen memory + // instruction. Widen memory instructions involved in address computation + // will lead to gather/scatter instructions, which don't need to be + // handled. + if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || + isa<VPInterleaveRecipe>(CurRec)) + continue; + + // This recipe contributes to the address computation of a widen + // load/store. Collect recipe if its underlying instruction has + // poison-generating flags. + Instruction *Instr = CurRec->getUnderlyingInstr(); + if (Instr && Instr->hasPoisonGeneratingFlags()) + State.MayGeneratePoisonRecipes.insert(CurRec); + + // Add new definitions to the worklist. + for (VPValue *operand : CurRec->operands()) + if (VPDef *OpDef = operand->getDef()) + Worklist.push_back(cast<VPRecipeBase>(OpDef)); + } + }); + + // Traverse all the recipes in the VPlan and collect the poison-generating + // recipes in the backward slice starting at the address of a VPWidenRecipe or + // VPInterleaveRecipe. 
+ auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &Recipe : *VPBB) { + if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { + Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + VPDef *AddrDef = WidenRec->getAddr()->getDef(); + if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && + Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { + VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + if (AddrDef) { + // Check if any member of the interleave group needs predication. + const InterleaveGroup<Instruction> *InterGroup = + InterleaveRec->getInterleaveGroup(); + bool NeedPredication = false; + for (int I = 0, NumMembers = InterGroup->getNumMembers(); + I < NumMembers; ++I) { + Instruction *Member = InterGroup->getMember(I); + if (Member) + NeedPredication |= + Legal->blockNeedsPredication(Member->getParent()); + } + + if (NeedPredication) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } + } + } + } +} + void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) { propagateMetadata(To, From); @@ -1541,7 +1607,16 @@ public: // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { + bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + // When we know the load is uniform and the original scalar loop was not + // predicated we don't need to mark it as a predicated instruction. Any + // vectorised blocks created when tail-folding are something artificial we + // have introduced and we know there is always at least one active lane. + // That's why we call Legal->blockNeedsPredication here because it doesn't + // query tail-folding. + if (IsKnownUniform && isa<LoadInst>(I) && + !Legal->blockNeedsPredication(I->getParent())) + return false; if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated @@ -1816,9 +1891,11 @@ private: /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. + /// during vectorization. collectLoopScalars should only add non-uniform nodes + /// to the list if they are used by a load/store instruction that is marked as + /// CM_Scalarize. Non-uniform scalarized instructions will be represented by + /// VF values in the vectorized loop, each corresponding to an iteration of + /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. 
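The hunks above introduce collectPoisonGeneratingRecipes, which walks backwards from the address operand of each conditional widened load/store through the recipe use-def chains and records every recipe whose underlying instruction carries poison-generating flags (nuw/nsw, exact, inbounds), so those flags can be dropped when code is generated. Below is a minimal, self-contained sketch of that worklist-based backward-slice pattern; Node, HasPoisonGeneratingFlags and the example graph are illustrative stand-ins, not the VPlan API:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

// Illustrative stand-in for a recipe/definition in a use-def graph.
struct Node {
  const char *Name;
  bool HasPoisonGeneratingFlags; // e.g. nuw/nsw, exact, inbounds
  bool IsWidenMemoryOp;          // other widened memory ops prune the walk
  std::vector<Node *> Operands;  // definitions this node uses
};

// Collect every node in the backward slice of AddrRoot whose poison-generating
// flags would have to be dropped. The walk prunes at widened memory operations,
// mirroring the patch: addresses computed from another widened load become
// gathers/scatters and need no flag dropping.
static void collectPoisonGeneratingNodes(Node *AddrRoot,
                                         std::unordered_set<Node *> &Out) {
  std::vector<Node *> Worklist{AddrRoot};
  std::unordered_set<Node *> Visited;
  while (!Worklist.empty()) {
    Node *Cur = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(Cur).second)
      continue; // already reached through another use-def path
    if (Cur->IsWidenMemoryOp)
      continue; // prune the search here
    if (Cur->HasPoisonGeneratingFlags)
      Out.insert(Cur); // flags must be dropped when code is generated
    for (Node *Op : Cur->Operands)
      Worklist.push_back(Op); // keep walking the backward slice
  }
}

int main() {
  Node Index{"index.nsw", /*flags*/ true, /*mem*/ false, {}};
  Node Gep{"gep.inbounds", true, false, {&Index}};
  std::unordered_set<Node *> MayGeneratePoison;
  collectPoisonGeneratingNodes(&Gep, MayGeneratePoison); // Gep = address root
  for (const Node *N : MayGeneratePoison)
    std::printf("drop poison-generating flags on %s\n", N->Name);
  return 0;
}
```

In the patch itself the roots are the address operands of VPWidenMemoryInstructionRecipe and VPInterleaveRecipe instances whose underlying blocks need predication, and the resulting set (State.MayGeneratePoisonRecipes) is consulted later to call dropPoisonGeneratingFlags() on the cloned or widened instructions.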
@@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, - bool Reverse) { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); - - Type *ScalarDataTy = getLoadStoreType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); - const Align Alignment = getLoadStoreAlignment(Instr); - bool CreateGatherScatter = !ConsecutiveStride; - - VectorParts BlockInMaskParts(UF); - bool isMaskRequired = BlockInMask; - if (isMaskRequired) - for (unsigned Part = 0; Part < UF; ++Part) - BlockInMaskParts[Part] = State.get(BlockInMask, Part); - - const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { - // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); - PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); - } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); - } - - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - }; - - // Handle Stores: - if (SI) { - setDebugLocFromInst(SI); - - for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = reverseVector(StoredVal); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). 
- } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - setDebugLocFromInst(LI); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - addMetadata(NewLI, LI); - } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); - - // Add metadata to the load, but setVectorValue to the reverse shuffle. - addMetadata(NewLI, LI); - if (Reverse) - NewLI = reverseVector(NewLI); - } - - State.set(Def, NewLI, Part); - } -} - -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + // If the scalarized instruction contributes to the address computation of a + // widen masked load/store which was in a basic block that needed predication + // and is not predicated after vectorization, we can't propagate + // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized + // instruction could feed a poison value to the base address of the widen + // load/store. + if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + Cloned->dropPoisonGeneratingFlags(); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - State.set(Def, Cloned, Instance); + State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. 
if (auto *II = dyn_cast<AssumeInst>(Cloned)) @@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, - VPTransformState &State) { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, EntryPart, Part); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : State.get(Operands.getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { - VPValue *Operand = Operands.getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(VPDef, NewGEP, Part); - addMetadata(NewGEP, GEP); - } - } -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, VPTransformState &State) { @@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); - - bool NeedsVectorIndex = !IsUniform && VF.isScalable(); - Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; - if (NeedsVectorIndex) { - Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); - UnitStepVec = Builder.CreateStepVector(VecIVTy); - PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); - } + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part); - if (NeedsVectorIndex) { - // Here we cache the whole vector, which means we can support the - // extraction of any lane. However, in some cases the extractelement - // instruction that is generated for scalar uses of this vector (e.g. - // a load instruction) is not folded away. Therefore we still - // calculate values for the first n lanes to avoid redundant moves - // (when extracting the 0th element) and to produce scalar code (i.e. - // additional add/gep instructions instead of expensive extractelement - // instructions) when extracting higher-order elements. - Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); - Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); - Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, Part); - } - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); @@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, - VPTransformState &State) { - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. 
- setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : User.operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - State.set(Def, V, Part); - addMetadata(V, &I); - } - - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *B = State.get(User.getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(Def, C, Part); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, Cast, Part); - addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State) { @@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, - VPUser &Operands, - bool InvariantCond, - VPTransformState &State) { - setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = InvariantCond - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : nullptr; - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = - InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); - Value *Op0 = State.get(Operands.getOperand(1), Part); - Value *Op1 = State.get(Operands.getOperand(2), Part); - Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, Sel, Part); - addMetadata(Sel, &I); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. 
Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + // A helper that evaluates a memory access's use of a pointer. If the use will + // be a scalar use and the pointer is only used by memory accesses, we place + // the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - - // If there is more than one user of Update (Ptr), we shouldn't assume it - // will be scalar after vectorisation as other users of the instruction - // may require widening. Otherwise, add it to ScalarPtrs. - if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { - ScalarPtrs.insert(Update); - return; - } - } // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; + // Returns true if \p Indvar is a pointer induction that is used directly by + // load/store instruction \p I. + auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, + Instruction *I) { + return Induction.second.getKind() == + InductionDescriptor::IK_PtrInduction && + (isa<LoadInst>(I) || isa<StoreInst>(I)) && + Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + }; + // Determine if all users of the induction variable are scalar after // vectorization. 
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; @@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; @@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` + // that it is being called from this specific place. // Figure out whether the access is strided and get the stride value // if it's known in compile time @@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For a call to the llvm.fmuladd intrinsic we need to add the cost of a + // normal fmul instruction to the cost of the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += + TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); + // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. 
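The cost-model hunk above prices an in-loop llvm.fmuladd reduction as a widened fmul plus an fadd reduction, and the recipe-building change later in this diff creates exactly that FMul recipe feeding the reduction. A small scalar illustration of why that split is legal, assuming only that llvm.fmuladd may lower either fused or unfused (the function names and the dot-product shape here are illustrative, not from the patch):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reduction written with fmuladd-style accumulation. llvm.fmuladd may be
// lowered fused (fma) or unfused (fmul + fadd), which is what makes the
// decomposition below a legal way to model it.
static double dotFMulAdd(const std::vector<double> &A,
                         const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I)
    Acc = std::fma(A[I], B[I], Acc); // acc = a*b + acc
  return Acc;
}

// The shape the patch costs and produces: a plain multiply per element (the
// new FMul recipe) feeding an fadd reduction of the products.
static double dotSplit(const std::vector<double> &A,
                       const std::vector<double> &B) {
  double Acc = 0.0;
  for (size_t I = 0; I < A.size(); ++I) {
    double Prod = A[I] * B[I]; // widened fmul
    Acc += Prod;               // fadd reduction
  }
  return Acc;
}

int main() {
  std::vector<double> A{1.0, 2.0, 3.0}, B{4.0, 5.0, 6.0};
  std::printf("%f %f\n", dotFMulAdd(A, B), dotSplit(A, B));
  return 0;
}
```

Under that reading, adding the cost of a normal fmul on top of the fadd reduction cost (as getReductionPatternCost now does) matches the recipes the planner emits for the fmuladd case.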
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { + if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); @@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "intermediate fn:\n" + << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (VPBB) VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); else { - Plan->setEntry(FirstVPBBForBB); + auto *TopRegion = new VPRegionBlock("vector loop"); + TopRegion->setEntry(FirstVPBBForBB); + Plan->setEntry(TopRegion); HeaderVPBB = FirstVPBBForBB; } VPBB = FirstVPBBForBB; @@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - assert(isa<VPBasicBlock>(Plan->getEntry()) && + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && - "entry block must be set to a non-empty VPBasicBlock"); + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || + (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + + if (IsFMulAdd) { + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe to use as the vector operand for the + // fadd reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); + FMulRecipe->setFastMathFlags(R->getFastMathFlags()); + WidenRecipe->getParent()->insert(FMulRecipe, + WidenRecipe->getIterator()); + VecOp = FMulRecipe; + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.ILV->setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.ILV->addMetadata(Sel, &I); + } } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.ILV->setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.count(this) > 0) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.ILV->addMetadata(V, &I); + } + + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.ILV->setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.ILV->addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.ILV->setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.ILV->addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant, State); + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.ILV->addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. 
The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = IsInBounds + ? State.Builder.CreateInBoundsGEP( + GEP->getSourceElementType(), Ptr, Indices) + : State.Builder.CreateGEP(GEP->getSourceElementType(), + Ptr, Indices); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.ILV->addMetadata(NewGEP, GEP); + } + } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, VPIteration(Part, Lane), IsPredicated, State); } @@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction( - &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask(), Consecutive, Reverse); + + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); + StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); + + assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGatherScatter = !Consecutive; + + auto &Builder = State.Builder; + InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); + bool isMaskRequired = getMask(); + if (isMaskRequired) + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = State.get(getMask(), Part); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + BlockInMaskParts[Part] = + Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); + } else { + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + PartPtr->setIsInBounds(InBounds); + } + + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + + // Handle Stores: + if (SI) { + State.ILV->setDebugLocFromInst(SI); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, Part); + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } else { + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.ILV->addMetadata(NewSI, SI); + } + return; + } + + // Handle loads. + assert(LI && "Must have a load instruction"); + State.ILV->setDebugLocFromInst(LI); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + State.ILV->addMetadata(NewLI, LI); + } else { + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewLI = Builder.CreateMaskedLoad( + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); + else + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + + // Add metadata to the load, but setVectorValue to the reverse shuffle. 
+ State.ILV->addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + } + + State.set(getVPSingleValue(), NewLI, Part); + } } // Determine how to lower the scalar epilogue, which depends on 1) optimising diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. 
- if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. 
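// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the chain walk that
// areTwoInsertFromSameBuildVector performs below, simplified to a single
// chain.  The real code walks both inserts in lockstep and gives up on
// intermediate inserts with more than one use.  The helper name is invented.
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Follows operand 0 of each insertelement back to the vector the buildvector
// sequence started from (typically an undef/poison vector).
static Value *getBuildVectorBase(InsertElementInst *IE) {
  Value *V = IE;
  while (auto *Cur = dyn_cast<InsertElementInst>(V))
    V = Cur->getOperand(0);
  return V;
}
// ---------------------------------------------------------------------------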
+ if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. 
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. 
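// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: folding the first two per-user
// insertelement masks into one two-source permute mask, in the spirit of the
// CombinedMask code that follows.  Plain C++; -1 stands in for
// llvm::UndefMaskElem and the masks are invented for the example.
#include <cstdio>
#include <vector>

int main() {
  constexpr int UndefMaskElem = -1;
  const unsigned MaxVF = 4; // widest destination vector
  const std::vector<int> Mask0 = {0, UndefMaskElem, 2, UndefMaskElem};
  const std::vector<int> Mask1 = {UndefMaskElem, 1, UndefMaskElem}; // narrower user

  std::vector<int> Combined(MaxVF, UndefMaskElem);
  for (unsigned I = 0, E = Mask0.size(); I != E; ++I)
    Combined[I] = Mask0[I]; // first source: indices kept as-is
  for (unsigned I = 0, E = Mask1.size(); I != E; ++I)
    if (Mask1[I] != UndefMaskElem)
      Combined[I] = Mask1[I] + MaxVF; // second source: offset by MaxVF

  for (int M : Combined)
    std::printf("%d ", M); // prints: 0 5 2 -1
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------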
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. 
void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. 
+ for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. 
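// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the core mask merge behind
// IsIdenticalOrLessDefined.  A mask is "less defined" when every lane is
// either undef or agrees with the other mask, so its undef lanes can be filled
// from the more defined copy.  The real lambda additionally requires identical
// vector operands and checks that trailing undefs do not change the number of
// vector registers used.  Plain C++; -1 stands in for llvm::UndefMaskElem and
// the helper name is invented.
#include <cstdio>
#include <vector>

// Returns true if New is compatible with Old; on success, undef lanes of New
// are filled in from Old.
static bool mergeIfLessDefined(std::vector<int> &New,
                               const std::vector<int> &Old) {
  constexpr int UndefMaskElem = -1;
  if (New.size() != Old.size())
    return false;
  for (size_t I = 0, E = New.size(); I != E; ++I) {
    if (New[I] != UndefMaskElem && Old[I] != UndefMaskElem && New[I] != Old[I])
      return false; // both lanes defined but they disagree
    if (New[I] == UndefMaskElem)
      New[I] = Old[I]; // take the more defined lane
  }
  return true;
}

int main() {
  std::vector<int> New = {0, 0, 0, -1};      // shuffle %0, poison, <0,0,0,undef>
  const std::vector<int> Old = {0, 0, 0, 0}; // shuffle %0, poison, <0,0,0,0>
  if (mergeIfLessDefined(New, Old))
    for (int M : New)
      std::printf("%d ", M); // prints: 0 0 0 0
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------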
+ return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, 
BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). 
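// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the sort-then-bundle idea behind
// the CompareSorter / AreCompatibleCompares lambdas defined below.  Compares
// are sorted so that ones with the same operand type and (possibly swapped)
// predicate become adjacent, then maximal runs of compatible neighbours are
// handed to the list vectorizer; the real code also filters out compares that
// look like reductions in other blocks.  Plain C++ with invented keys.
#include <algorithm>
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

struct CmpKey { std::string OpTy; int Pred; }; // stand-in for a CmpInst

int main() {
  std::vector<CmpKey> Cmps = {{"i32", 40}, {"float", 1}, {"i32", 38},
                              {"i32", 40}, {"float", 1}};
  // Sort so that potentially compatible compares end up next to each other.
  std::sort(Cmps.begin(), Cmps.end(), [](const CmpKey &A, const CmpKey &B) {
    return std::tie(A.OpTy, A.Pred) < std::tie(B.OpTy, B.Pred);
  });
  // Each maximal run of compatible neighbours is a candidate bundle.
  for (size_t I = 0, E = Cmps.size(); I != E;) {
    size_t J = I + 1;
    while (J != E && Cmps[J].OpTy == Cmps[I].OpTy && Cmps[J].Pred == Cmps[I].Pred)
      ++J;
    std::printf("bundle of %zu compare(s): (%s, pred %d)\n", J - I,
                Cmps[I].OpTy.c_str(), Cmps[I].Pred);
    I = J;
  }
  return 0;
}
// ---------------------------------------------------------------------------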
+ auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 638467f94e1c..44b5e1df0839 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } @@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << Instruction::getOpcodeName(getOpcode()); } + O << FMF; + for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); @@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 00ee31007cb7..810dd5030f95 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,7 @@ class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPReplicateRecipe; class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. @@ -346,6 +347,10 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Holds recipes that may generate a poison value that is used after + /// vectorization, even when their operands are not poison. + SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; }; /// VPUsers instance used by VPBlockBase to manage CondBit and the block @@ -789,6 +794,7 @@ public: private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; + FastMathFlags FMF; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -802,13 +808,6 @@ public: : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPSingleValue()); - } - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} @@ -870,6 +869,9 @@ public: return true; } } + + /// Set the fast-math flags. + void setFastMathFlags(FastMathFlags FMFNew); }; /// VPWidenRecipe is a recipe for producing a copy of vector type its @@ -1511,7 +1513,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. 
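// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the RAII FastMathFlagGuard
// pattern that VPInstruction::execute uses above so the recipe's fast-math
// flags only apply to the IR it emits.  Standalone program against the LLVM
// C++ API; the module/function names are invented for the example.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("fmf-demo", Ctx);
  auto *FltTy = Type::getFloatTy(Ctx);
  auto *FnTy = FunctionType::get(FltTy, {FltTy, FltTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);

  FastMathFlags FMF;
  FMF.setAllowReassoc();
  FMF.setNoNaNs();

  Value *Sum;
  {
    IRBuilderBase::FastMathFlagGuard Guard(B); // saves the builder's flags
    B.setFastMathFlags(FMF);
    Sum = B.CreateFAdd(F->getArg(0), F->getArg(1), "sum"); // emitted reassoc nnan
  } // previous (empty) flags restored here
  Value *Plain = B.CreateFAdd(Sum, F->getArg(0), "plain"); // emitted without FMF
  B.CreateRet(Plain);
  M.print(outs(), nullptr);
  return 0;
}
// ---------------------------------------------------------------------------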
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
 public:
   VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                                  bool Consecutive, bool Reverse)
-      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
+      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}),
+        VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load),
         Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
-    new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
     setMask(Mask);
   }

@@ -1544,6 +1546,7 @@ public:
   VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *Mask,
                                  bool Consecutive, bool Reverse)
       : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
+        VPValue(VPValue::VPVMemoryInstructionSC, &Store, this),
         Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
     setMask(Mask);
