Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
9 files changed, 375 insertions, 312 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index fa2459d1ca02..1f11d4894f77 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1193,7 +1193,7 @@ std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
       OpA->getType() != OpB->getType())
     return std::nullopt;
 
-  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+  uint64_t Stride = GTIA.getSequentialElementStride(DL);
 
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 577ce8000de2..cff72ae263d8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,9 +167,14 @@ public:
   }
 
   VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL, const Twine &Name = "") {
-    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
-                        Name);
+                        DebugLoc DL, const Twine &Name = "",
+                        std::optional<FastMathFlags> FMFs = std::nullopt) {
+    auto *Select =
+        FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+                                 *FMFs, DL, Name)
+             : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+                                 DL, Name);
+    return tryInsertInstruction(Select);
   }
 
   /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
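[Editorial note, not part of the diff: the createSelect overload above now threads optional fast-math flags into the VPInstruction. A minimal sketch of a caller, mirroring the call site in adjustRecipesForReductions further down in this diff; the helper name and parameter list are hypothetical.]

    // Sketch only: attach the reduction's FMFs to the select for FP
    // reductions, and none for integer reductions.
    static VPValue *createExitSelect(VPBuilder &Builder, VPValue *Cond,
                                     VPValue *VecExiting, VPValue *Phi,
                                     Type *PhiTy,
                                     const RecurrenceDescriptor &RdxDesc) {
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(RdxDesc.getFastMathFlags())
              : std::nullopt;
      // DebugLoc and name are left empty, as in the in-tree caller.
      return Builder.createSelect(Cond, VecExiting, Phi, {}, "", FMFs);
    }
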
@@ -341,16 +346,20 @@ public:
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected \p VF, \p UF and VPlan \p BestPlan.
+  /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
+  /// according to the best selected \p VF and \p UF.
+  ///
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
   /// vectorization re-using plans for both the main and epilogue vector loops.
   /// It should be removed once the re-use issue has been fixed.
   /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
-  /// to re-use expansion results generated during main plan execution. Returns
-  /// a mapping of SCEVs to their expanded IR values. Note that this is a
-  /// temporary workaround needed due to the current epilogue handling.
-  DenseMap<const SCEV *, Value *>
+  /// to re-use expansion results generated during main plan execution.
+  ///
+  /// Returns a mapping of SCEVs to their expanded IR values and a mapping for
+  /// the reduction resume values. Note that this is a temporary workaround
+  /// needed due to the current epilogue handling.
+  std::pair<DenseMap<const SCEV *, Value *>,
+            DenseMap<const RecurrenceDescriptor *, Value *>>
   executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
               InnerLoopVectorizer &LB, DominatorTree *DT,
               bool IsEpilogueVectorization,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8e135d80f4f2..51ce88480c08 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,10 +584,6 @@ public:
   /// able to vectorize with strict in-order reductions for the given RdxDesc.
   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
 
-  // Returns the resume value (bc.merge.rdx) for a reduction as
-  // generated by fixReduction.
-  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
-
   /// Create a new phi node for the induction variable \p OrigPhi to resume
   /// iteration count in the scalar epilogue, from where the vectorized loop
   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -626,9 +622,6 @@ protected:
                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                     VPlan &Plan, VPTransformState &State);
 
-  /// Handle all cross-iteration phis in the header.
-  void fixCrossIterationPHIs(VPTransformState &State);
-
   /// Create the exit value of first order recurrences in the middle block and
   /// update their users.
   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
@@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
   }
 }
 
-PHINode *InnerLoopVectorizer::getReductionResumeValue(
-    const RecurrenceDescriptor &RdxDesc) {
-  auto It = ReductionResumeValues.find(&RdxDesc);
-  assert(It != ReductionResumeValues.end() &&
-         "Expected to find a resume value for the reduction.");
-  return It->second;
-}
-
 namespace llvm {
 
 // Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   // At this point every instruction in the original loop is widened to a
   // vector form. Now we need to fix the recurrences in the loop. These PHI
   // nodes are currently empty because we did not want to introduce cycles.
-  // This is the second stage of vectorizing recurrences.
-  fixCrossIterationPHIs(State);
+  // This is the second stage of vectorizing recurrences. Note that fixing
+  // reduction phis are already modeled in VPlan.
+  // TODO: Also model fixing fixed-order recurrence phis in VPlan.
+  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
+  VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
+  for (VPRecipeBase &R : HeaderVPBB->phis()) {
+    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+      fixFixedOrderRecurrence(FOR, State);
+  }
 
   // Forget the original basic block.
   PSE.getSE()->forgetLoop(OrigLoop);
@@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   for (PHINode &PN : Exit->phis())
     PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
 
-  VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
   if (Cost->requiresScalarEpilogue(VF.isVector())) {
     // No edge from the middle block to the unique exit block has been inserted
@@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                VF.getKnownMinValue() * UF);
 }
 
-void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
-  // In order to support recurrences we need to be able to vectorize Phi nodes.
-  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
-  // stage #2: We now need to fix the recurrences by adding incoming edges to
-  // the currently empty PHI nodes. At this point every instruction in the
-  // original loop is widened to a vector form so we can use them to construct
-  // the incoming edges.
-  VPBasicBlock *Header =
-      State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
-
-  for (VPRecipeBase &R : Header->phis()) {
-    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
-      fixReduction(ReductionPhi, State);
-  }
-
-  for (VPRecipeBase &R : Header->phis()) {
-    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
-      fixFixedOrderRecurrence(FOR, State);
-  }
-}
-
 void InnerLoopVectorizer::fixFixedOrderRecurrence(
     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
   // This is the second phase of vectorizing first-order recurrences. An
@@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
   Phi->setName("scalar.recur");
 }
 
-void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
-                                       VPTransformState &State) {
-  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
-  // Get it's reduction variable descriptor.
-  assert(Legal->isReductionVariable(OrigPhi) &&
-         "Unable to find the reduction variable");
-  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
-  RecurKind RK = RdxDesc.getRecurrenceKind();
-  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
-  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-  if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
-    State.setDebugLocFrom(I->getDebugLoc());
-
-  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
-
-  // Before each round, move the insertion point right between
-  // the PHIs and the values we are going to write.
-  // This allows us to write both PHINodes and the extractelement
-  // instructions.
-  Builder.SetInsertPoint(LoopMiddleBlock,
-                         LoopMiddleBlock->getFirstInsertionPt());
-
-  State.setDebugLocFrom(LoopExitInst->getDebugLoc());
-
-  Type *PhiTy = OrigPhi->getType();
-  // If tail is folded by masking, the vector value to leave the loop should be
-  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
-  // instead of the former. For an inloop reduction the reduction will already
-  // be predicated, and does not need to be handled here.
-  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
-    VPValue *Def = nullptr;
-    for (VPUser *U : LoopExitInstDef->users()) {
-      auto *S = dyn_cast<VPInstruction>(U);
-      if (S && S->getOpcode() == Instruction::Select) {
-        Def = S;
-        break;
-      }
-    }
-    if (Def)
-      LoopExitInstDef = Def;
-  }
-
-  VectorParts RdxParts(UF);
-  for (unsigned Part = 0; Part < UF; ++Part)
-    RdxParts[Part] = State.get(LoopExitInstDef, Part);
-
-  // If the vector reduction can be performed in a smaller type, we truncate
-  // then extend the loop exit value to enable InstCombine to evaluate the
-  // entire expression in the smaller type.
-  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
-    Builder.SetInsertPoint(LoopMiddleBlock,
-                           LoopMiddleBlock->getFirstInsertionPt());
-    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
-    }
-  }
-
-  // Reduce all of the unrolled parts into a single vector.
-  Value *ReducedPartRdx = RdxParts[0];
-  unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
-  // The middle block terminator has already been assigned a DebugLoc here (the
-  // OrigLoop's single latch terminator). We want the whole middle block to
-  // appear to execute on this line because: (a) it is all compiler generated,
-  // (b) these instructions are always executed after evaluating the latch
-  // conditional branch, and (c) other passes may add new predecessors which
-  // terminate on this line. This is the easiest way to ensure we don't
-  // accidentally cause an extra step back into the loop while debugging.
-  State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
-  if (PhiR->isOrdered())
-    ReducedPartRdx = RdxParts[UF - 1];
-  else {
-    // Floating-point operations should have some FMF to enable the reduction.
-    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
-    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-    for (unsigned Part = 1; Part < UF; ++Part) {
-      Value *RdxPart = RdxParts[Part];
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        ReducedPartRdx = Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
-        ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
-                                       ReducedPartRdx, RdxPart);
-      else
-        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
-    }
-  }
-
-  // Create the reduction after the loop. Note that inloop reductions create the
-  // target reduction in the loop using a Reduction recipe.
-  if (VF.isVector() && !PhiR->isInLoop()) {
-    ReducedPartRdx =
-        createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
-    // If the reduction can be performed in a smaller type, we need to extend
-    // the reduction to the wider type before we branch to the original loop.
-    if (PhiTy != RdxDesc.getRecurrenceType())
-      ReducedPartRdx = RdxDesc.isSigned()
-                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
-                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
-  }
-
-  PHINode *ResumePhi =
-      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-
-  // Create a phi node that merges control-flow from the backedge-taken check
-  // block and the middle block.
-  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
-                                        LoopScalarPreHeader->getTerminator());
-
-  // If we are fixing reductions in the epilogue loop then we should already
-  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
-  // we carry over the incoming values correctly.
-  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (Incoming == LoopMiddleBlock)
-      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
-    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
-                              Incoming);
-    else
-      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
-  }
-
-  // Set the resume value for this reduction
-  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
-
-  // If there were stores of the reduction value to a uniform memory address
-  // inside the loop, create the final store here.
-  if (StoreInst *SI = RdxDesc.IntermediateStore) {
-    StoreInst *NewSI =
-        Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
-                                   SI->getAlign());
-    propagateMetadata(NewSI, SI);
-
-    // If the reduction value is used in other places,
-    // then let the code below create PHI's for that.
-  }
-
-  // Now, we need to fix the users of the reduction variable
-  // inside and outside of the scalar remainder loop.
-
-  // We know that the loop is in LCSSA form. We need to update the PHI nodes
-  // in the exit blocks.  See comment on analogous loop in
-  // fixFixedOrderRecurrence for a more complete explaination of the logic.
-  if (!Cost->requiresScalarEpilogue(VF.isVector()))
-    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
-      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
-        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
-        State.Plan->removeLiveOut(&LCSSAPhi);
-      }
-
-  // Fix the scalar loop reduction variable with the incoming reduction sum
-  // from the vector body and from the backedge value.
-  int IncomingEdgeBlockIdx =
-      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-  // Pick the other block.
-  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
   // The basic block and loop containing the predicated instruction.
   auto *PredBB = PredInst->getParent();
@@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  // If trip count is known or estimated compile time constant, limit the
-  // interleave count to be less than the trip count divided by VF, provided it
-  // is at least 1.
-  //
-  // For scalable vectors we can't know if interleaving is beneficial. It may
-  // not be beneficial for small loops if none of the lanes in the second vector
-  // iterations is enabled. However, for larger loops, there is likely to be a
-  // similar benefit as for fixed-width vectors. For now, we choose to leave
-  // the InterleaveCount as if vscale is '1', although if some information about
-  // the vector is known (e.g. min vector size), we can make a better decision.
-  if (BestKnownTC) {
-    MaxInterleaveCount =
-        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
-    // Make sure MaxInterleaveCount is greater than 0.
-    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable()) {
+    if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
+      EstimatedVF *= *VScale;
+  }
+  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+
+  unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+  if (KnownTC) {
+    // If trip count is known we select between two prospective ICs, where
+    // 1) the aggressive IC is capped by the trip count divided by VF
+    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+    // The final IC is selected in a way that the epilogue loop trip count is
+    // minimized while maximizing the IC itself, so that we either run the
+    // vector loop at least once if it generates a small epilogue loop, or else
+    // we run the vector loop at least twice.
+
+    unsigned InterleaveCountUB = bit_floor(
+        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+    unsigned InterleaveCountLB = bit_floor(std::max(
+        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+    MaxInterleaveCount = InterleaveCountLB;
+
+    if (InterleaveCountUB != InterleaveCountLB) {
+      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      // If both produce same scalar tail, maximize the IC to do the same work
+      // in fewer vector loop iterations
+      if (TailTripCountUB == TailTripCountLB)
+        MaxInterleaveCount = InterleaveCountUB;
+    }
+  } else if (BestKnownTC) {
+    // If trip count is an estimated compile time constant, limit the
+    // IC to be capped by the trip count divided by VF * 2, such that the vector
+    // loop runs at least twice to make interleaving seem profitable when there
+    // is an epilogue loop present. Since exact Trip count is not known we
+    // choose to be conservative in our IC estimate.
+    MaxInterleaveCount = bit_floor(std::max(
+        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&
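[Editorial note, not part of the diff: the comments in the new selectInterleaveCount logic above describe choosing between an aggressive and a conservative interleave count. A self-contained arithmetic sketch of that selection — plain C++20, independent of LLVM; std::bit_floor stands in for llvm::bit_floor.]

    #include <algorithm>
    #include <bit>
    #include <cstdio>

    // Mirrors the known-trip-count path: take the larger power-of-two IC
    // only when it leaves the same scalar tail as the conservative one.
    unsigned selectIC(unsigned KnownTC, unsigned EstimatedVF, unsigned MaxIC) {
      unsigned UB = std::bit_floor(
          std::max(1u, std::min(KnownTC / EstimatedVF, MaxIC)));
      unsigned LB = std::bit_floor(
          std::max(1u, std::min(KnownTC / (EstimatedVF * 2), MaxIC)));
      if (UB != LB &&
          KnownTC % (EstimatedVF * UB) == KnownTC % (EstimatedVF * LB))
        return UB; // Same epilogue either way; use fewer vector iterations.
      return LB;
    }

    int main() {
      std::printf("%u\n", selectIC(32, 4, 8)); // UB=8, LB=4, both tails 0 -> 8
      std::printf("%u\n", selectIC(24, 4, 8)); // UB=4 (tail 8), LB=2 (tail 0) -> 2
    }
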
@@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
   }
 }
 
-SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
+// create a merge phi node for it and add it to \p ReductionResumeValues.
+static void createAndCollectMergePhiForReduction(
+    VPInstruction *RedResult,
+    DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
+    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+  if (!RedResult ||
+      RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+    return;
+
+  auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
+  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+  Value *FinalValue =
+      State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+  auto *ResumePhi =
+      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+  // TODO: bc.merge.rdx should not be created here, instead it should be
+  // modeled in VPlan.
+  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+  // Create a phi node that merges control-flow from the backedge-taken check
+  // block and the middle block.
+  auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+                                     LoopScalarPreHeader->getTerminator());
+
+  // If we are fixing reductions in the epilogue loop then we should already
+  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+  // we carry over the incoming values correctly.
+  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+    if (Incoming == LoopMiddleBlock)
+      BCBlockPhi->addIncoming(FinalValue, Incoming);
+    else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+                              Incoming);
+    else
+      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+  }
+
+  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+  // TODO: This fixup should instead be modeled in VPlan.
+  // Fix the scalar loop reduction variable with the incoming reduction sum
+  // from the vector body and from the backedge value.
+  int IncomingEdgeBlockIdx =
+      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+  // Pick the other block.
+  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+
+  ReductionResumeValues[&RdxDesc] = BCBlockPhi;
+}
+
+std::pair<DenseMap<const SCEV *, Value *>,
+          DenseMap<const RecurrenceDescriptor *, Value *>>
+LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
@@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
 
   BestVPlan.execute(&State);
 
+  // 2.5 Collect reduction resume values.
+  DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
+  auto *ExitVPBB =
+      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+  for (VPRecipeBase &R : *ExitVPBB) {
+    createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
+                                         ReductionResumeValues, State, OrigLoop,
+                                         State.CFG.VPBB2IRBB[ExitVPBB]);
+  }
+
+  // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
   MDNode *OrigLoopID = OrigLoop->getLoopID();
@@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  return State.ExpandedSCEVs;
+  return {State.ExpandedSCEVs, ReductionResumeValues};
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
   if (ECEntryIt != EdgeMaskCache.end())
     return ECEntryIt->second;
 
-  VPValue *SrcMask = createBlockInMask(Src, Plan);
+  VPValue *SrcMask = getBlockInMask(Src);
 
   // The terminator has to be a branch inst!
   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
   BlockMaskCache[Header] = BlockMask;
 }
 
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
-  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
-  // Look for cached value.
-  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
-  if (BCEntryIt != BlockMaskCache.end())
-    return BCEntryIt->second;
+VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+  // Return the cached value.
+  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
+  assert(BCEntryIt != BlockMaskCache.end() &&
+         "Trying to access mask for block without one.");
+  return BCEntryIt->second;
+}
 
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
   assert(OrigLoop->getHeader() != BB &&
          "Loop header must have cached block mask");
@@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
-    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
-      return BlockMaskCache[BB] = EdgeMask;
+    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
+      BlockMaskCache[BB] = EdgeMask;
+      return;
+    }
 
     if (!BlockMask) { // BlockMask has its initialized nullptr value.
       BlockMask = EdgeMask;
@@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
   }
 
-  return BlockMaskCache[BB] = BlockMask;
+  BlockMaskCache[BB] = BlockMask;
 }
 
 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
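[Editorial note, not part of the diff: createBlockInMask/getBlockInMask now split mask handling into a compute-once phase and a lookup phase — masks are created while visiting blocks in reverse post-order, so every predecessor's mask already exists, and later consumers only read the cache. A toy model of that invariant; the types are stand-ins, not VPlan classes.]

    #include <cassert>
    #include <unordered_map>

    struct MaskCache {
      std::unordered_map<int, int> Cache; // block id -> mask id

      // Phase 1: called once per block, in reverse post-order.
      void createBlockInMask(int BB, int Mask) {
        assert(!Cache.count(BB) && "Mask for block already computed");
        Cache[BB] = Mask;
      }
      // Phase 2: recipe construction only looks masks up.
      int getBlockInMask(int BB) const {
        auto It = Cache.find(BB);
        assert(It != Cache.end() && "mask requested before creation");
        return It->second;
      }
    };
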
@@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
 
   VPValue *Mask = nullptr;
   if (Legal->isMaskRequired(I))
-    Mask = createBlockInMask(I->getParent(), *Plan);
+    Mask = getBlockInMask(I->getParent());
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
-    auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
-                                                Reverse, I->getDebugLoc());
+    auto *GEP = dyn_cast<GetElementPtrInst>(
+        Ptr->getUnderlyingValue()->stripPointerCasts());
+    auto *VectorPtr = new VPVectorPointerRecipe(
+        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
+        I->getDebugLoc());
     Builder.getInsertBlock()->appendRecipe(VectorPtr);
     Ptr = VectorPtr;
   }
@@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       // all-true mask.
       VPValue *Mask = nullptr;
       if (Legal->isMaskRequired(CI))
-        Mask = createBlockInMask(CI->getParent(), *Plan);
+        Mask = getBlockInMask(CI->getParent());
       else
         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
     // div/rem operation itself. Otherwise fall through to general handling below.
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
-      VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+      VPValue *Mask = getBlockInMask(I->getParent());
       VPValue *One = Plan->getVPValueOrAddLiveIn(
           ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS =
@@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
     // added initially. Masked replicate recipes will later be placed under an
     // if-then construct to prevent side-effects. Generate recipes to compute
     // the block mask for this region.
-    BlockInMask = createBlockInMask(I->getParent(), Plan);
+    BlockInMask = getBlockInMask(I->getParent());
   }
 
   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
-  // Proactively create header mask. Masks for other blocks are created on
-  // demand.
-  RecipeBuilder.createHeaderMask(*Plan);
-
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   LoopBlocksDFS DFS(OrigLoop);
   DFS.perform(LI);
 
   VPBasicBlock *VPBB = HeaderVPBB;
+  bool NeedsMasks = CM.foldTailByMasking() ||
+                    any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
+                      return Legal->blockNeedsPredication(BB);
+                    });
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     // Relevant instructions from basic block BB will be grouped into VPRecipe
     // ingredients and fill a new VPBasicBlock.
@@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     VPBB->setName(BB->getName());
     Builder.setInsertPoint(VPBB);
 
+    if (VPBB == HeaderVPBB)
+      RecipeBuilder.createHeaderMask(*Plan);
+    else if (NeedsMasks)
+      RecipeBuilder.createBlockInMask(BB, *Plan);
+
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.
     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
@@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 // to reductions, with one operand being vector and the other being the scalar
 // reduction chain. For other reductions, a select is introduced between the phi
 // and live-out recipes when folding the tail.
+//
+// A ComputeReductionResult recipe is added to the middle block, also for
+// in-loop reductions which compute their result in-loop, because generating
+// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
 void LoopVectorizationPlanner::adjustRecipesForReductions(
     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
     ElementCount MinVF) {
-  VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
+  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
   // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
   // sank outside of the loop would keep the same order as they had in the
   // original loop.
@@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   for (VPRecipeBase *R : ReductionPHIList)
     R->moveBefore(*Header, Header->getFirstNonPhi());
 
-  SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
       continue;
-    InLoopReductionPhis.push_back(PhiR);
-  }
-
-  for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
+
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
     RecurKind Kind = RdxDesc.getRecurrenceKind();
     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
@@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       if (CM.blockNeedsPredicationForAnyReason(BB)) {
         VPBuilder::InsertPointGuard Guard(Builder);
         Builder.setInsertPoint(CurrentLink);
-        CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
+        CondOp = RecipeBuilder.getBlockInMask(BB);
       }
 
       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   for (VPRecipeBase &R :
        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
-    if (!PhiR || PhiR->isInLoop())
+    if (!PhiR)
       continue;
 
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-    auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
     // If tail is folded by masking, introduce selects between the phi
     // and the live-out instruction of each reduction, at the beginning of the
     // dedicated latch block.
-    if (CM.foldTailByMasking()) {
-      VPValue *Cond =
-          RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
-      VPValue *Red = PhiR->getBackedgeValue();
-      assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
+    auto *OrigExitingVPV = PhiR->getBackedgeValue();
+    auto *NewExitingVPV = PhiR->getBackedgeValue();
+    if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
+      VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
+      assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
             "reduction recipe must be defined before latch");
-      FastMathFlags FMFs = RdxDesc.getFastMathFlags();
       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
-      Result =
+      std::optional<FastMathFlags> FMFs =
           PhiTy->isFloatingPointTy()
-              ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
-              : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
-      Result->insertBefore(&*Builder.getInsertPoint());
-      Red->replaceUsesWithIf(
-          Result->getVPSingleValue(),
-          [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+              ? std::make_optional(RdxDesc.getFastMathFlags())
+              : std::nullopt;
+      NewExitingVPV =
+          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
+        return isa<VPInstruction>(&U) &&
+               cast<VPInstruction>(&U)->getOpcode() ==
+                   VPInstruction::ComputeReductionResult;
+      });
       if (PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect(
              PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags()))
-        PhiR->setOperand(1, Result->getVPSingleValue());
+        PhiR->setOperand(1, NewExitingVPV);
     }
+
     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
     // entire expression in the smaller type.
@@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
       Type *RdxTy = RdxDesc.getRecurrenceType();
-      auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
-                                          Result->getVPSingleValue(), RdxTy);
+      auto *Trunc =
+          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
       auto *Extnd =
          RdxDesc.isSigned()
             ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
            : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
 
-      Trunc->insertAfter(Result);
+      Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
       Extnd->insertAfter(Trunc);
-      Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
-      Trunc->setOperand(0, Result->getVPSingleValue());
+      if (PhiR->getOperand(1) == NewExitingVPV)
+        PhiR->setOperand(1, Extnd->getVPSingleValue());
+      NewExitingVPV = Extnd;
     }
+
+    // We want code in the middle block to appear to execute on the location of
+    // the scalar loop's latch terminator because: (a) it is all compiler
+    // generated, (b) these instructions are always executed after evaluating
+    // the latch conditional branch, and (c) other passes may add new
+    // predecessors which terminate on this line. This is the easiest way to
+    // ensure we don't accidentally cause an extra step back into the loop while
+    // debugging.
+    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+
+    // TODO: At the moment ComputeReductionResult also drives creation of the
+    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
+    // even for in-loop reductions, until the reduction resume value handling is
+    // also modeled in VPlan.
+    auto *FinalReductionResult = new VPInstruction(
+        VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+    cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
+        ->appendRecipe(FinalReductionResult);
+    OrigExitingVPV->replaceUsesWithIf(
+        FinalReductionResult,
+        [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
   }
 
   VPlanTransforms::clearReductionWrapFlags(*Plan);
@@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                          EPI, &LVL, &CM, BFI, PSI, Checks);
 
         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
-        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             BestMainPlan, MainILV, DT, true);
+        const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
+            EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           Value *ResumeV = nullptr;
           // TODO: Move setting of resume values to prepareToExecute.
           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
-            ResumeV = MainILV.getReductionResumeValue(
-                ReductionPhi->getRecurrenceDescriptor());
+            ResumeV = ReductionResumeValues
+                          .find(&ReductionPhi->getRecurrenceDescriptor())
+                          ->second;
           } else {
             // Create induction resume values for both widened pointer and
             // integer/fp inductions and update the start value of the induction
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 304991526064..8e22b54f002d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10596,7 +10596,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     inversePermutation(E->ReorderIndices, ReorderMask);
     if (!ReorderMask.empty())
       reorderScalars(GatheredScalars, ReorderMask);
-    auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) {
+    auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
+                               unsigned I, unsigned SliceSize) {
       if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
             return isa<UndefValue>(V) && !isa<PoisonValue>(V);
           }))
@@ -10619,11 +10620,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
            Idx == 0) ||
           (Mask.size() == InputVF &&
            ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
-        std::iota(Mask.begin(), Mask.end(), 0);
+        std::iota(std::next(Mask.begin(), I * SliceSize),
+                  std::next(Mask.begin(), (I + 1) * SliceSize), 0);
       } else {
-        unsigned I =
+        unsigned IVal =
            *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
-        std::fill(Mask.begin(), Mask.end(), I);
+        std::fill(std::next(Mask.begin(), I * SliceSize),
+                  std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
       }
       return true;
     };
@@ -10872,7 +10875,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
       } else if (Vec1) {
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
-            cast<FixedVectorType>(Vec1->getType())->getNumElements());
+            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
+            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
@@ -10898,7 +10902,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
         if (TEs.size() == 1) {
           IsUsedInExpr &=
-              FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
+              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
           ShuffleBuilder.add(*TEs.front(), VecMask);
           if (TEs.front()->VectorizedValue)
             IsNonPoisoned &=
@@ -11139,6 +11143,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
 
     case Instruction::ExtractElement: {
       Value *V = E->getSingleOperand(0);
+      if (const TreeEntry *TE = getTreeEntry(V))
+        V = TE->VectorizedValue;
       setInsertPointAfterBundle(E);
       V = FinalShuffle(V, E, VecTy, IsSigned);
       E->VectorizedValue = V;
@@ -11903,8 +11909,10 @@ Value *BoUpSLP::vectorizeTree(
         if (!Ex) {
           // "Reuse" the existing extract to improve final codegen.
           if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
-            Ex = Builder.CreateExtractElement(ES->getOperand(0),
-                                              ES->getOperand(1));
+            Value *V = ES->getVectorOperand();
+            if (const TreeEntry *ETE = getTreeEntry(V))
+              V = ETE->VectorizedValue;
+            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
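[Editorial note, not part of the diff: the FindReusedSplat change above stops overwriting the whole shuffle mask and instead fills only the slice owned by the current tree entry. A standalone illustration of the slice arithmetic — plain C++, unrelated to SLP's real data structures.]

    #include <cstdio>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main() {
      constexpr unsigned SliceSize = 4, I = 1; // second slice of three
      std::vector<int> Mask(12, -1);           // -1 plays PoisonMaskElem

      // Identity-splat case: iota over [I*SliceSize, (I+1)*SliceSize) only,
      // leaving the slices owned by other tree entries untouched.
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), 0);

      for (int M : Mask)
        std::printf("%d ", M); // -1 -1 -1 -1 0 1 2 3 -1 -1 -1 -1
      std::printf("\n");
    }
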
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a0908..4b3143aead46 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,8 +138,11 @@ public:
 
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True or the loop mask when
-  /// tail folding. It returns the *entry* mask for the block BB.
-  VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
+  /// tail folding.
+  void createBlockInMask(BasicBlock *BB, VPlan &Plan);
+
+  /// Returns the *entry* mask for the block \p BB.
+  VPValue *getBlockInMask(BasicBlock *BB) const;
 
   /// A helper function that computes the predicate of the edge between SRC
   /// and DST.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d7df9c9575a..b6e56c47c227 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
     // ExitBB can be re-used for the exit block of the Plan.
     NewBB = State->CFG.ExitBB;
     State->CFG.PrevBB = NewBB;
+    State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
 
     // Update the branch instruction in the predecessor to branch to ExitBB.
     VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 7d33baac52c9..4b4f4911eb64 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -842,6 +842,12 @@ public:
     WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
   };
 
+protected:
+  struct GEPFlagsTy {
+    char IsInBounds : 1;
+    GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
+  };
+
 private:
   struct DisjointFlagsTy {
     char IsDisjoint : 1;
@@ -849,9 +855,6 @@ private:
   struct ExactFlagsTy {
     char IsExact : 1;
   };
-  struct GEPFlagsTy {
-    char IsInBounds : 1;
-  };
   struct NonNegFlagsTy {
     char NonNeg : 1;
   };
@@ -933,12 +936,21 @@ public:
       : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp),
         FMFs(FMFs) {}
 
+protected:
+  template <typename IterT>
+  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+                      GEPFlagsTy GEPFlags, DebugLoc DL = {})
+      : VPRecipeBase(SC, Operands, DL), OpType(OperationType::GEPOp),
+        GEPFlags(GEPFlags) {}
+
+public:
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
-           R->getVPDefID() == VPRecipeBase::VPReplicateSC;
+           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
   }
 
   /// Drop all poison-generating flags.
@@ -1061,7 +1073,8 @@ public:
     // Increment the canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    ComputeReductionResult,
   };
 
 private:
@@ -1360,15 +1373,16 @@ public:
 /// A recipe to compute the pointers for widened memory accesses of IndexTy for
 /// all parts. If IsReverse is true, compute pointers for accessing the input in
 /// reverse order per part.
-class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
+class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue {
   Type *IndexedTy;
   bool IsReverse;
 
 public:
   VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
-                        DebugLoc DL)
-      : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
-        IndexedTy(IndexedTy), IsReverse(IsReverse) {}
+                        bool IsInBounds, DebugLoc DL)
+      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
+                            GEPFlagsTy(IsInBounds), DL),
+        VPValue(this), IndexedTy(IndexedTy), IsReverse(IsReverse) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -3132,6 +3146,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return Rep->isUniform();
   if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
     return all_of(GEP->operands(), isUniformAfterVectorization);
+  if (auto *VPI = dyn_cast<VPInstruction>(Def))
+    return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
 } // end namespace vputils
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76961629aece..1f844bce2310 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>
@@ -119,7 +120,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     return false;
   case VPInstructionSC:
     switch (cast<VPInstruction>(this)->getOpcode()) {
+    case Instruction::Or:
     case Instruction::ICmp:
+    case Instruction::Select:
     case VPInstruction::Not:
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrementForPart:
@@ -401,6 +404,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::ComputeReductionResult: {
+    if (Part != 0)
+      return State.get(this, 0);
+
+    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
+    // and will be removed by breaking up the recipe further.
+    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
+    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+    // Get its reduction variable descriptor.
+    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+
+    State.setDebugLocFrom(getDebugLoc());
+
+    VPValue *LoopExitingDef = getOperand(1);
+    Type *PhiTy = OrigPhi->getType();
+    VectorParts RdxParts(State.UF);
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      RdxParts[Part] = State.get(LoopExitingDef, Part);
+
+    // If the vector reduction can be performed in a smaller type, we truncate
+    // then extend the loop exit value to enable InstCombine to evaluate the
+    // entire expression in the smaller type.
+    // TODO: Handle this in truncateToMinBW.
+    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+      for (unsigned Part = 0; Part < State.UF; ++Part)
+        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+    }
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+    if (PhiR->isOrdered()) {
+      ReducedPartRdx = RdxParts[State.UF - 1];
+    } else {
+      // Floating-point operations should have some FMF to enable the reduction.
+      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+      for (unsigned Part = 1; Part < State.UF; ++Part) {
+        Value *RdxPart = RdxParts[Part];
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+          ReducedPartRdx = Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+          TrackingVH<Value> ReductionStartValue =
+              RdxDesc.getRecurrenceStartValue();
+          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+                                         ReducedPartRdx, RdxPart);
+        } else
+          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+      }
+    }
+
+    // Create the reduction after the loop. Note that inloop reductions create
+    // the target reduction in the loop using a Reduction recipe.
+    if (State.VF.isVector() && !PhiR->isInLoop()) {
+      ReducedPartRdx =
+          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+      // If the reduction can be performed in a smaller type, we need to extend
+      // the reduction to the wider type before we branch to the original loop.
+      if (PhiTy != RdxDesc.getRecurrenceType())
+        ReducedPartRdx = RdxDesc.isSigned()
+                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+    }
+
+    // If there were stores of the reduction value to a uniform memory address
+    // inside the loop, create the final store here.
+    if (StoreInst *SI = RdxDesc.IntermediateStore) {
+      auto *NewSI = Builder.CreateAlignedStore(
+          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
+      propagateMetadata(NewSI, SI);
+    }
+
+    return ReducedPartRdx;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -477,6 +558,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::ComputeReductionResult:
+    O << "compute-reduction-result";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -1225,9 +1309,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
-    bool InBounds = false;
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = GEP->isInBounds();
+    bool InBounds = isInBounds();
     if (IsReverse) {
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
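[Editorial note, not part of the diff: the new ComputeReductionResult recipe above first combines the UF unrolled vector parts element-wise ("bin.rdx"), then lowers the surviving vector to a scalar via createTargetReduction. A scalar toy model of those two steps for an integer add reduction — standalone C++, not the recipe's actual IR emission.]

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      const unsigned UF = 2, VF = 4;
      std::vector<std::vector<int>> RdxParts = {{1, 2, 3, 4}, {5, 6, 7, 8}};

      // Step 1: element-wise combine of the unrolled parts ("bin.rdx").
      std::vector<int> Reduced = RdxParts[0];
      for (unsigned Part = 1; Part < UF; ++Part)
        for (unsigned Lane = 0; Lane < VF; ++Lane)
          Reduced[Lane] += RdxParts[Part][Lane];

      // Step 2: horizontal reduction of the remaining "vector"
      // (the createTargetReduction step in the real code).
      int Result = std::accumulate(Reduced.begin(), Reduced.end(), 0);
      std::printf("%d\n", Result); // 36
    }
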
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 33132880d5a4..5c430620a2dc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -829,15 +829,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     Type *ATy = TypeInfo.inferScalarType(A);
     if (TruncTy == ATy) {
       Trunc->replaceAllUsesWith(A);
-    } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
-      auto *VPC =
-          new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-      VPC->insertBefore(&R);
-      Trunc->replaceAllUsesWith(VPC);
-    } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
-      auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
-      VPC->insertBefore(&R);
-      Trunc->replaceAllUsesWith(VPC);
+    } else {
+      // Don't replace a scalarizing recipe with a widened cast.
+      if (isa<VPReplicateRecipe>(&R))
+        break;
+      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+        auto *VPC =
+            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      }
     }
 #ifndef NDEBUG
     // Verify that the cached type info is for both A and its users is still