diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 481 |
1 files changed, 210 insertions, 271 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8e135d80f4f2..51ce88480c08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -584,10 +584,6 @@ public: /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - // Returns the resume value (bc.merge.rdx) for a reduction as - // generated by fixReduction. - PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); - /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. In cases @@ -626,9 +622,6 @@ protected: BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State); - /// Handle all cross-iteration phis in the header. - void fixCrossIterationPHIs(VPTransformState &State); - /// Create the exit value of first order recurrences in the middle block and /// update their users. void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, @@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -PHINode *InnerLoopVectorizer::getReductionResumeValue( - const RecurrenceDescriptor &RdxDesc) { - auto It = ReductionResumeValues.find(&RdxDesc); - assert(It != ReductionResumeValues.end() && - "Expected to find a resume value for the reduction."); - return It->second; -} - namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. - // This is the second stage of vectorizing recurrences. - fixCrossIterationPHIs(State); + // This is the second stage of vectorizing recurrences. Note that fixing + // reduction phis are already modeled in VPlan. + // TODO: Also model fixing fixed-order recurrence phis in VPlan. + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); + for (VPRecipeBase &R : HeaderVPBB->phis()) { + if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + fixFixedOrderRecurrence(FOR, State); + } // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); @@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted @@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #2: We now need to fix the recurrences by adding incoming edges to - // the currently empty PHI nodes. At this point every instruction in the - // original loop is widened to a vector form so we can use them to construct - // the incoming edges. - VPBasicBlock *Header = - State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) - fixReduction(ReductionPhi, State); - } - - for (VPRecipeBase &R : Header->phis()) { - if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) - fixFixedOrderRecurrence(FOR, State); - } -} - void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An @@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setName("scalar.recur"); } -void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); - // Get it's reduction variable descriptor. - assert(Legal->isReductionVariable(OrigPhi) && - "Unable to find the reduction variable"); - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - - RecurKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue)) - State.setDebugLocFrom(I->getDebugLoc()); - - VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - - State.setDebugLocFrom(LoopExitInst->getDebugLoc()); - - Type *PhiTy = OrigPhi->getType(); - // If tail is folded by masking, the vector value to leave the loop should be - // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. For an inloop reduction the reduction will already - // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { - VPValue *Def = nullptr; - for (VPUser *U : LoopExitInstDef->users()) { - auto *S = dyn_cast<VPInstruction>(U); - if (S && S->getOpcode() == Instruction::Select) { - Def = S; - break; - } - } - if (Def) - LoopExitInstDef = Def; - } - - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) - RdxParts[Part] = State.get(LoopExitInstDef, Part); - - // If the vector reduction can be performed in a smaller type, we truncate - // then extend the loop exit value to enable InstCombine to evaluate the - // entire expression in the smaller type. - if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - } - } - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getOpcode(RK); - - // The middle block terminator has already been assigned a DebugLoc here (the - // OrigLoop's single latch terminator). We want the whole middle block to - // appear to execute on this line because: (a) it is all compiler generated, - // (b) these instructions are always executed after evaluating the latch - // conditional branch, and (c) other passes may add new predecessors which - // terminate on this line. This is the easiest way to ensure we don't - // accidentally cause an extra step back into the loop while debugging. - State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); - if (PhiR->isOrdered()) - ReducedPartRdx = RdxParts[UF - 1]; - else { - // Floating-point operations should have some FMF to enable the reduction. - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = RdxParts[Part]; - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - ReducedPartRdx = Builder.CreateBinOp( - (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } - } - - // Create the reduction after the loop. Note that inloop reductions create the - // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (PhiTy != RdxDesc.getRecurrenceType()) - ReducedPartRdx = RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, PhiTy) - : Builder.CreateZExt(ReducedPartRdx, PhiTy); - } - - PHINode *ResumePhi = - dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); - - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - - // If we are fixing reductions in the epilogue loop then we should already - // have created a bc.merge.rdx Phi after the main vector body. Ensure that - // we carry over the incoming values correctly. - for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); - else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(ReductionStartValue, Incoming); - } - - // Set the resume value for this reduction - ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); - - // If there were stores of the reduction value to a uniform memory address - // inside the loop, create the final store here. - if (StoreInst *SI = RdxDesc.IntermediateStore) { - StoreInst *NewSI = - Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), - SI->getAlign()); - propagateMetadata(NewSI, SI); - - // If the reduction value is used in other places, - // then let the code below create PHI's for that. - } - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - - // We know that the loop is in LCSSA form. We need to update the PHI nodes - // in the exit blocks. See comment on analogous loop in - // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF.isVector())) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { - LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF, provided it - // is at least 1. - // - // For scalable vectors we can't know if interleaving is beneficial. It may - // not be beneficial for small loops if none of the lanes in the second vector - // iterations is enabled. However, for larger loops, there is likely to be a - // similar benefit as for fixed-width vectors. For now, we choose to leave - // the InterleaveCount as if vscale is '1', although if some information about - // the vector is known (e.g. min vector size), we can make a better decision. - if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); - // Make sure MaxInterleaveCount is greater than 0. - MaxInterleaveCount = std::max(1u, MaxInterleaveCount); + unsigned EstimatedVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) + EstimatedVF *= *VScale; + } + assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); + + unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + if (KnownTC) { + // If trip count is known we select between two prospective ICs, where + // 1) the aggressive IC is capped by the trip count divided by VF + // 2) the conservative IC is capped by the trip count divided by (VF * 2) + // The final IC is selected in a way that the epilogue loop trip count is + // minimized while maximizing the IC itself, so that we either run the + // vector loop at least once if it generates a small epilogue loop, or else + // we run the vector loop at least twice. + + unsigned InterleaveCountUB = bit_floor( + std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); + unsigned InterleaveCountLB = bit_floor(std::max( + 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + MaxInterleaveCount = InterleaveCountLB; + + if (InterleaveCountUB != InterleaveCountLB) { + unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); + unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); + // If both produce same scalar tail, maximize the IC to do the same work + // in fewer vector loop iterations + if (TailTripCountUB == TailTripCountLB) + MaxInterleaveCount = InterleaveCountUB; + } + } else if (BestKnownTC) { + // If trip count is an estimated compile time constant, limit the + // IC to be capped by the trip count divided by VF * 2, such that the vector + // loop runs at least twice to make interleaving seem profitable when there + // is an epilogue loop present. Since exact Trip count is not known we + // choose to be conservative in our IC estimate. + MaxInterleaveCount = bit_floor(std::max( + 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); } assert(MaxInterleaveCount > 0 && @@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -SCEV2ValueTy LoopVectorizationPlanner::executePlan( +// Check if \p RedResult is a ComputeReductionResult instruction, and if it is +// create a merge phi node for it and add it to \p ReductionResumeValues. +static void createAndCollectMergePhiForReduction( + VPInstruction *RedResult, + DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, + VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { + if (!RedResult || + RedResult->getOpcode() != VPInstruction::ComputeReductionResult) + return; + + auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + + TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Value *FinalValue = + State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); + auto *ResumePhi = + dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + + // TODO: bc.merge.rdx should not be created here, instead it should be + // modeled in VPlan. + BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(FinalValue, Incoming); + else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + // TODO: This fixup should instead be modeled in VPlan. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + + ReductionResumeValues[&RdxDesc] = BCBlockPhi; +} + +std::pair<DenseMap<const SCEV *, Value *>, + DenseMap<const RecurrenceDescriptor *, Value *>> +LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { @@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); + // 2.5 Collect reduction resume values. + DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; + auto *ExitVPBB = + cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), + ReductionResumeValues, State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB]); + } + + // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). MDNode *OrigLoopID = OrigLoop->getLoopID(); @@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); - return State.ExpandedSCEVs; + return {State.ExpandedSCEVs, ReductionResumeValues}; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (ECEntryIt != EdgeMaskCache.end()) return ECEntryIt->second; - VPValue *SrcMask = createBlockInMask(Src, Plan); + VPValue *SrcMask = getBlockInMask(Src); // The terminator has to be a branch inst! BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); @@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { BlockMaskCache[Header] = BlockMask; } -VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - - // Look for cached value. - BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); - if (BCEntryIt != BlockMaskCache.end()) - return BCEntryIt->second; +VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { + // Return the cached value. + BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); + assert(BCEntryIt != BlockMaskCache.end() && + "Trying to access mask for block without one."); + return BCEntryIt->second; +} +void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); assert(OrigLoop->getHeader() != BB && "Loop header must have cached block mask"); @@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); - if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. - return BlockMaskCache[BB] = EdgeMask; + if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. + BlockMaskCache[BB] = EdgeMask; + } if (!BlockMask) { // BlockMask has its initialized nullptr value. BlockMask = EdgeMask; @@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } - return BlockMaskCache[BB] = BlockMask; + BlockMaskCache[BB] = BlockMask; } VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, @@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), *Plan); + Mask = getBlockInMask(I->getParent()); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; if (Consecutive) { - auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), - Reverse, I->getDebugLoc()); + auto *GEP = dyn_cast<GetElementPtrInst>( + Ptr->getUnderlyingValue()->stripPointerCasts()); + auto *VectorPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false, + I->getDebugLoc()); Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } @@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // all-true mask. VPValue *Mask = nullptr; if (Legal->isMaskRequired(CI)) - Mask = createBlockInMask(CI->getParent(), *Plan); + Mask = getBlockInMask(CI->getParent()); else Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); @@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, // div/rem operation itself. Otherwise fall through to general handling below. if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); - VPValue *Mask = createBlockInMask(I->getParent(), *Plan); + VPValue *Mask = getBlockInMask(I->getParent()); VPValue *One = Plan->getVPValueOrAddLiveIn( ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = @@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, // added initially. Masked replicate recipes will later be placed under an // if-then construct to prevent side-effects. Generate recipes to compute // the block mask for this region. - BlockInMask = createBlockInMask(I->getParent(), Plan); + BlockInMask = getBlockInMask(I->getParent()); } auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), @@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - // Proactively create header mask. Masks for other blocks are created on - // demand. - RecipeBuilder.createHeaderMask(*Plan); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; + bool NeedsMasks = CM.foldTailByMasking() || + any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { + return Legal->blockNeedsPredication(BB); + }); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); + if (VPBB == HeaderVPBB) + RecipeBuilder.createHeaderMask(*Plan); + else if (NeedsMasks) + RecipeBuilder.createBlockInMask(BB, *Plan); + // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { @@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // to reductions, with one operand being vector and the other being the scalar // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. +// +// A ComputeReductionResult recipe is added to the middle block, also for +// in-loop reductions which compute their result in-loop, because generating +// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores // sank outside of the loop would keep the same order as they had in the // original loop. @@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase *R : ReductionPHIList) R->moveBefore(*Header, Header->getFirstNonPhi()); - SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis; for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; - InLoopReductionPhis.push_back(PhiR); - } - for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && @@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (CM.blockNeedsPredicationForAnyReason(BB)) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(CurrentLink); - CondOp = RecipeBuilder.createBlockInMask(BB, *Plan); + CondOp = RecipeBuilder.getBlockInMask(BB); } VPReductionRecipe *RedRecipe = new VPReductionRecipe( @@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); - if (!PhiR || PhiR->isInLoop()) + if (!PhiR) continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { - VPValue *Cond = - RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); - VPValue *Red = PhiR->getBackedgeValue(); - assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && + auto *OrigExitingVPV = PhiR->getBackedgeValue(); + auto *NewExitingVPV = PhiR->getBackedgeValue(); + if (!PhiR->isInLoop() && CM.foldTailByMasking()) { + VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); + assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); - FastMathFlags FMFs = RdxDesc.getFastMathFlags(); Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); - Result = + std::optional<FastMathFlags> FMFs = PhiTy->isFloatingPointTy() - ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) - : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); - Result->insertBefore(&*Builder.getInsertPoint()); - Red->replaceUsesWithIf( - Result->getVPSingleValue(), - [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); }); + ? std::make_optional(RdxDesc.getFastMathFlags()) + : std::nullopt; + NewExitingVPV = + Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); + OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { + return isa<VPInstruction>(&U) && + cast<VPInstruction>(&U)->getOpcode() == + VPInstruction::ComputeReductionResult; + }); if (PreferPredicatedReductionSelect || TTI.preferPredicatedReductionSelect( PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) - PhiR->setOperand(1, Result->getVPSingleValue()); + PhiR->setOperand(1, NewExitingVPV); } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); - auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, - Result->getVPSingleValue(), RdxTy); + auto *Trunc = + new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); auto *Extnd = RdxDesc.isSigned() ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); - Trunc->insertAfter(Result); + Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); Extnd->insertAfter(Trunc); - Result->getVPSingleValue()->replaceAllUsesWith(Extnd); - Trunc->setOperand(0, Result->getVPSingleValue()); + if (PhiR->getOperand(1) == NewExitingVPV) + PhiR->setOperand(1, Extnd->getVPSingleValue()); + NewExitingVPV = Extnd; } + + // We want code in the middle block to appear to execute on the location of + // the scalar loop's latch terminator because: (a) it is all compiler + // generated, (b) these instructions are always executed after evaluating + // the latch conditional branch, and (c) other passes may add new + // predecessors which terminate on this line. This is the easiest way to + // ensure we don't accidentally cause an extra step back into the loop while + // debugging. + DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); + + // TODO: At the moment ComputeReductionResult also drives creation of the + // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here + // even for in-loop reductions, until the reduction resume value handling is + // also modeled in VPlan. + auto *FinalReductionResult = new VPInstruction( + VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); + cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) + ->appendRecipe(FinalReductionResult); + OrigExitingVPV->replaceUsesWithIf( + FinalReductionResult, + [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); } VPlanTransforms::clearReductionWrapFlags(*Plan); @@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { EPI, &LVL, &CM, BFI, PSI, Checks); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - BestMainPlan, MainILV, DT, true); + const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( + EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { - ResumeV = MainILV.getReductionResumeValue( - ReductionPhi->getRecurrenceDescriptor()); + ResumeV = ReductionResumeValues + .find(&ReductionPhi->getRecurrenceDescriptor()) + ->second; } else { // Create induction resume values for both widened pointer and // integer/fp inductions and update the start value of the induction |