| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
| commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
| tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | |
| parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
| parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 484 |

1 file changed, 430 insertions, 54 deletions
```diff
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 83bfdfd09d19..33132880d5a4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,17 +12,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
-#include "VPlanDominatorTree.h"
 #include "VPRecipeBuilder.h"
+#include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 void VPlanTransforms::VPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
@@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
       NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
     } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
       NewRecipe = new VPWidenCastRecipe(
-          CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI);
+          CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
     } else {
       NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
     }
@@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) {
       // TODO: add ".cloned" suffix to name of Clone's VPValue.
 
       Clone->insertBefore(SinkCandidate);
-      for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) {
-        auto *UI = cast<VPRecipeBase>(U);
-        if (UI->getParent() == SinkTo)
-          continue;
-
-        for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) {
-          if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue())
-            continue;
-          UI->setOperand(Idx, Clone);
-        }
-      }
+      SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
+          Clone, [SinkTo](VPUser &U, unsigned) {
+            return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
+          });
     }
     SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
     for (VPValue *Op : SinkCandidate->operands())
@@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
       VPValue *PredInst1 =
           cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
       VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
-      for (VPUser *U : to_vector(Phi1ToMoveV->users())) {
-        auto *UI = dyn_cast<VPRecipeBase>(U);
-        if (!UI || UI->getParent() != Then2)
-          continue;
-        for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
-          if (Phi1ToMoveV != U->getOperand(I))
-            continue;
-          U->setOperand(I, PredInst1);
-        }
-      }
+      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
+        auto *UI = dyn_cast<VPRecipeBase>(&U);
+        return UI && UI->getParent() == Then2;
+      });
 
       Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
     }
```
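Several hunks above replace a hand-rolled walk over a use list — cast each user, skip it via a predicate, then scan and rewrite matching operands — with a single `replaceUsesWithIf` call taking the replacement value and a predicate. A minimal self-contained sketch of that idiom, using simplified stand-in `Value`/`User` types rather than the real VPlan classes:

```cpp
#include <functional>
#include <iostream>
#include <vector>

struct Value;

// Simplified stand-ins for VPUser/VPValue; illustration only.
struct User {
  std::vector<Value *> Operands;
  int Parent; // stands in for the recipe's parent block
};

struct Value {
  std::vector<User *> Users;

  // Replace this value in every use for which ShouldReplace returns true,
  // mirroring the shape of VPValue::replaceUsesWithIf(New, Pred). Unlike the
  // real implementation, this sketch does not also maintain the use lists.
  void replaceUsesWithIf(
      Value *New, const std::function<bool(User &, unsigned)> &ShouldReplace) {
    for (User *U : Users)
      for (unsigned I = 0; I != U->Operands.size(); ++I)
        if (U->Operands[I] == this && ShouldReplace(*U, I))
          U->Operands[I] = New;
  }
};

int main() {
  Value Old, New;
  User InBlock{{&Old}, /*Parent=*/1}, OutOfBlock{{&Old}, /*Parent=*/2};
  Old.Users = {&InBlock, &OutOfBlock};

  // Same shape as the sinkScalarOperands hunk: only rewrite uses whose
  // parent block differs from the sink target (block 1 here).
  Old.replaceUsesWithIf(&New, [](User &U, unsigned) { return U.Parent != 1; });

  std::cout << (InBlock.Operands[0] == &Old) << ' '
            << (OutOfBlock.Operands[0] == &New) << '\n'; // prints: 1 1
}
```

Pushing the filtering into a predicate removes the duplicated operand-scanning loops and keeps the rewrite logic in one audited place.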
```diff
@@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
     // The recipes in the block are processed in reverse order, to catch chains
     // of dead recipes.
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
-            return V->getNumUsers() > 0;
-          }))
+      // A user keeps R alive:
+      if (any_of(R.definedValues(),
+                 [](VPValue *V) { return V->getNumUsers(); }))
+        continue;
+
+      // Having side effects keeps R alive, but do remove conditional assume
+      // instructions as their conditions may be flattened.
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      bool IsConditionalAssume =
+          RepR && RepR->isPredicated() &&
+          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+      if (R.mayHaveSideEffects() && !IsConditionalAssume)
         continue;
+
       R.eraseFromParent();
     }
   }
 }
 
+static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
+                                    ScalarEvolution &SE, Instruction *TruncI,
+                                    Type *IVTy, VPValue *StartV,
+                                    VPValue *Step) {
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  auto IP = HeaderVPBB->getFirstNonPhi();
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
+  VPValue *BaseIV = CanonicalIV;
+  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
+    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
+                                   TruncI ? TruncI->getType() : nullptr);
+    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+  }
+
+  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
+  HeaderVPBB->insert(Steps, IP);
+  return Steps;
+}
+
 void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
   SmallVector<VPRecipeBase *> ToRemove;
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
@@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
         }))
       continue;
 
-    auto IP = HeaderVPBB->getFirstNonPhi();
-    VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
-    Type *ResultTy = WideIV->getPHINode()->getType();
-    if (Instruction *TruncI = WideIV->getTruncInst())
-      ResultTy = TruncI->getType();
     const InductionDescriptor &ID = WideIV->getInductionDescriptor();
-    VPValue *Step = WideIV->getStepValue();
-    VPValue *BaseIV = CanonicalIV;
-    if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step,
-                                  ResultTy)) {
-      BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV,
-                                     Step, ResultTy);
-      HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
-    }
-
-    VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
-    HeaderVPBB->insert(Steps, IP);
+    VPValue *Steps = createScalarIVSteps(
+        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
+        WideIV->getStartValue(), WideIV->getStepValue());
 
-    // Update scalar users of IV to use Step instead. Use SetVector to ensure
-    // the list of users doesn't contain duplicates.
-    SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end());
-    for (VPUser *U : Users) {
-      if (HasOnlyVectorVFs && !U->usesScalars(WideIV))
-        continue;
-      for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) {
-        if (U->getOperand(I) != WideIV)
-          continue;
-        U->setOperand(I, Steps);
-      }
-    }
+    // Update scalar users of IV to use Step instead.
+    if (!HasOnlyVectorVFs)
+      WideIV->replaceAllUsesWith(Steps);
+    else
+      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
+        return U.usesScalars(WideIV);
+      });
   }
 }
```
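`removeDeadRecipes` visits each block bottom-up via `make_early_inc_range(reverse(*VPBB))`, so erasing the current recipe never invalidates the traversal, and a single pass can peel away whole chains: removing a dead user can make the definitions it reads dead in turn, and those are visited next. A toy stand-alone model of why the reverse order suffices, with made-up `Recipe` and use-count bookkeeping:

```cpp
#include <iostream>
#include <list>
#include <unordered_map>
#include <vector>

// Toy "recipe": defines value Id, reads the values in Uses; NumUsers counts
// remaining users of the value this recipe defines.
struct Recipe {
  int Id;
  std::vector<int> Uses;
  int NumUsers = 0;
};

int main() {
  // Block in program order: r0; r1(uses r0); r2(uses r1). Only r2 starts dead.
  std::list<Recipe> Block{{0, {}, 1}, {1, {0}, 1}, {2, {1}, 0}};
  std::unordered_map<int, Recipe *> Def;
  for (Recipe &R : Block)
    Def[R.Id] = &R;

  // Snapshot the iterators, then visit them in reverse. Erasing the current
  // element never touches the remaining iterators -- the same guarantee the
  // early-increment range gives the real pass.
  std::vector<std::list<Recipe>::iterator> Order;
  for (auto It = Block.begin(); It != Block.end(); ++It)
    Order.push_back(It);

  for (auto It = Order.rbegin(); It != Order.rend(); ++It) {
    Recipe &R = **It;
    if (R.NumUsers > 0) // a user keeps R alive
      continue;
    for (int Op : R.Uses) // dropping R may make its operands dead too
      --Def[Op]->NumUsers;
    Block.erase(*It);
  }
  std::cout << Block.size() << '\n'; // prints 0: one pass removed the chain
}
```

A forward pass over the same block would only remove `r2` and need two more iterations to reach `r0`.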
```diff
@@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
     }
   }
 }
+
+/// Returns true if \p V is a constant one.
+static bool isConstantOne(VPValue *V) {
+  if (!V->isLiveIn())
+    return false;
+  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
+  return C && C->isOne();
+}
+
+/// Returns the llvm::Instruction opcode for \p R.
+static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
+  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
+    return WidenR->getUnderlyingInstr()->getOpcode();
+  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
+    return WidenC->getOpcode();
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
+    return RepR->getUnderlyingInstr()->getOpcode();
+  if (auto *VPI = dyn_cast<VPInstruction>(&R))
+    return VPI->getOpcode();
+  return 0;
+}
+
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+  switch (getOpcodeForRecipe(R)) {
+  case Instruction::Mul: {
+    VPValue *A = R.getOperand(0);
+    VPValue *B = R.getOperand(1);
+    if (isConstantOne(A))
+      return R.getVPSingleValue()->replaceAllUsesWith(B);
+    if (isConstantOne(B))
+      return R.getVPSingleValue()->replaceAllUsesWith(A);
+    break;
+  }
+  case Instruction::Trunc: {
+    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
+    if (!Ext)
+      break;
+    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
+    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+      break;
+    VPValue *A = Ext->getOperand(0);
+    VPValue *Trunc = R.getVPSingleValue();
+    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+    Type *ATy = TypeInfo.inferScalarType(A);
+    if (TruncTy == ATy) {
+      Trunc->replaceAllUsesWith(A);
+    } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+      auto *VPC =
+          new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+      VPC->insertBefore(&R);
+      Trunc->replaceAllUsesWith(VPC);
+    } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+      auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+      VPC->insertBefore(&R);
+      Trunc->replaceAllUsesWith(VPC);
+    }
+#ifndef NDEBUG
+    // Verify that the cached type info for both A and its users is still
+    // accurate by comparing it to freshly computed types.
+    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
+    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+    for (VPUser *U : A->users()) {
+      auto *R = dyn_cast<VPRecipeBase>(U);
+      if (!R)
+        continue;
+      for (VPValue *VPV : R->definedValues())
+        assert(TypeInfo.inferScalarType(VPV) ==
+               TypeInfo2.inferScalarType(VPV));
+    }
+#endif
+    break;
+  }
+  default:
+    break;
+  }
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  VPTypeAnalysis TypeInfo(Ctx);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      simplifyRecipe(R, TypeInfo);
+    }
+  }
+}
```
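The `Instruction::Trunc` case above folds `trunc(zext/sext A)` purely by comparing bit widths: equal widths mean the casts cancel, a narrower source re-extends `A` directly to the result type, and a wider source truncates `A` directly. The same case split on plain integer widths, as an illustrative sketch with no VPlan types involved:

```cpp
#include <cassert>
#include <iostream>

enum class Cast { None, ZExt, SExt, Trunc };

// Decide how `trunc to DestBits (ext A from SrcBits)` simplifies, mirroring
// the width comparison in simplifyRecipe. WasSigned selects sext vs. zext.
Cast foldTruncOfExt(unsigned SrcBits, unsigned DestBits, bool WasSigned) {
  if (SrcBits == DestBits)
    return Cast::None; // trunc undoes the extend: the casts cancel
  if (SrcBits < DestBits)
    return WasSigned ? Cast::SExt : Cast::ZExt; // extend A directly instead
  return Cast::Trunc; // source wider than the result: truncate A directly
}

int main() {
  // trunc back to A's own width: the trunc/zext pair disappears.
  assert(foldTruncOfExt(32, 32, false) == Cast::None);
  // trunc to i16 of a zext-from-i8: becomes a single zext i8 -> i16.
  assert(foldTruncOfExt(8, 16, false) == Cast::ZExt);
  // trunc to i16 of a sext-from-i32: becomes a single trunc i32 -> i16.
  assert(foldTruncOfExt(32, 16, true) == Cast::Trunc);
  std::cout << "all folds behave as expected\n";
}
```

Each branch is sound because extension only adds high bits and truncation only drops them, so the low `DestBits` of the folded form match the original expression.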
```diff
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+    LLVMContext &Ctx) {
+#ifndef NDEBUG
+  // Count the processed recipes and cross check the count later with MinBWs
+  // size, to make sure all entries in MinBWs have been handled.
+  unsigned NumProcessedRecipes = 0;
+#endif
+  // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make other
+  // uses have different types for their operands, making them invalidly
+  // typed.
+  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+  VPTypeAnalysis TypeInfo(Ctx);
+  VPBasicBlock *PH = Plan.getEntry();
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      NumProcessedRecipes++;
+#endif
+      // If the value wasn't vectorized, we must maintain the original scalar
+      // type. Skip those here, after incrementing NumProcessedRecipes. Also
+      // skip casts which do not need to be handled explicitly here, as
+      // redundant casts will be removed during recipe simplification.
+      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
+#ifndef NDEBUG
+        // If any of the operands is a live-in that is not used by
+        // VPWidenRecipe or VPWidenSelectRecipe, but is in MinBWs, make sure
+        // it is counted as processed as well. As MinBWs is currently
+        // constructed, there is no information about whether recipes are
+        // widened or replicated, and in case they are replicated the
+        // operands are not truncated. Counting them here ensures we do not
+        // miss any recipes in MinBWs.
+        // TODO: Remove once the analysis is done on VPlan.
+        for (VPValue *Op : R.operands()) {
+          if (!Op->isLiveIn())
+            continue;
+          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
+          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
+              all_of(Op->users(), [](VPUser *U) {
+                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
+              })) {
+            // Add an entry to ProcessedTruncs to avoid counting the same
+            // operand multiple times.
+            ProcessedTruncs[Op] = nullptr;
+            NumProcessedRecipes += 1;
+          }
+        }
+#endif
+        continue;
+      }
+
+      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+      assert(OldResTy->isIntegerTy() && "only integer types supported");
+      if (OldResSizeInBits == NewResSizeInBits)
+        continue;
+      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+      (void)OldResSizeInBits;
+
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits =
+            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto [ProcessedIter, IterIsEmpty] =
+            ProcessedTruncs.insert({Op, nullptr});
+        VPWidenCastRecipe *NewOp =
+            IterIsEmpty
+                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+                : ProcessedIter->second;
+        R.setOperand(Idx, NewOp);
+        if (!IterIsEmpty)
+          continue;
+        ProcessedIter->second = NewOp;
+        if (!Op->isLiveIn()) {
+          NewOp->insertBefore(&R);
+        } else {
+          PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+          bool IsContained = MinBWs.contains(OpInst);
+          NumProcessedRecipes += IsContained;
+#endif
+        }
+      }
+
+      // Any wrapping introduced by shrinking this operation shouldn't be
+      // considered undefined behavior. So, we can't unconditionally copy
+      // arithmetic wrapping flags to VPW.
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext =
+          new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == NumProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
+void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+  removeRedundantCanonicalIVs(Plan);
+  removeRedundantInductionCasts(Plan);
+
+  optimizeInductions(Plan, SE);
+  simplifyRecipes(Plan, SE.getContext());
+  removeDeadRecipes(Plan);
+
+  createAndOptimizeReplicateRegions(Plan);
+
+  removeRedundantExpandSCEVRecipes(Plan);
+  mergeBlocksIntoPredecessors(Plan);
+}
```
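The `ProcessedTruncs.insert({Op, nullptr})` call above performs the lookup and the slot reservation in one step: `IterIsEmpty` tells the caller whether a cached truncate already existed, and the freshly created recipe is written back through the returned iterator. The same insert-then-fill caching idiom with `std::unordered_map` (hypothetical `Trunc`/`getOrCreateTrunc` names, not the DenseMap API):

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Trunc {
  std::string Name;
};

std::unordered_map<std::string, std::unique_ptr<Trunc>> Cache;
int Created = 0;

// Return a truncate for Key, creating it at most once. insert() both looks
// up the key and reserves the slot, so only one hash lookup is needed.
Trunc *getOrCreateTrunc(const std::string &Key) {
  auto [It, Inserted] = Cache.insert({Key, nullptr});
  if (Inserted) { // slot was empty: create the value and fill it in
    It->second = std::make_unique<Trunc>(Trunc{"trunc." + Key});
    ++Created;
  }
  return It->second.get();
}

int main() {
  Trunc *A = getOrCreateTrunc("op0");
  Trunc *B = getOrCreateTrunc("op0"); // reuses the cached truncate
  std::cout << (A == B) << ' ' << Created << '\n'; // prints: 1 1
}
```

Reusing one truncate per operand is also what makes the no-RAUW caveat in the comment matter: every user of the shrunken operand must see the same, consistently typed cast.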
```diff
+// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
+// the loop terminator with a branch-on-cond recipe with the negated
+// active-lane-mask as operand. Note that this turns the loop into an
+// uncountable one. Only the existing terminator is replaced; all other
+// existing recipes/users remain unchanged, except for poison-generating
+// flags being dropped from the canonical IV increment. Return the created
+// VPActiveLaneMaskPHIRecipe.
+//
+// The function uses the following definitions:
+//
+//  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
+//    calculate-trip-count-minus-VF (original TC) : original TC
+//  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
+//    CanonicalIVPhi : CanonicalIVIncrement
+//  %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+//   %TripCount = calculate-trip-count-minus-VF (original TC)
+//       [if DataAndControlFlowWithoutRuntimeCheck]
+//   %EntryInc = canonical-iv-increment-for-part %StartV
+//   %EntryALM = active-lane-mask %EntryInc, %TripCount
+//
+// vector.body:
+//   ...
+//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
+//   ...
+//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+//   %ALM = active-lane-mask %InLoopInc, %TripCount
+//   %Negated = Not %ALM
+//   branch-on-cond %Negated
+//
+static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
+  auto *CanonicalIVPHI = Plan.getCanonicalIV();
+  VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+  auto *CanonicalIVIncrement =
+      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+  // TODO: Check if dropping the flags is needed if
+  // !DataAndControlFlowWithoutRuntimeCheck.
+  CanonicalIVIncrement->dropPoisonGeneratingFlags();
+  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+  // we have to take unrolling into account. Each part needs to start at
+  //   Part * VF.
+  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+  VPBuilder Builder(VecPreheader);
+
+  // Create the ActiveLaneMask instruction using the correct start values.
+  VPValue *TC = Plan.getTripCount();
+
+  VPValue *TripCount, *IncrementValue;
+  if (!DataAndControlFlowWithoutRuntimeCheck) {
+    // When the loop is guarded by a runtime overflow check for the loop
+    // induction variable increment by VF, we can increment the value before
+    // the get.active.lane.mask and use the unmodified tripcount.
+    IncrementValue = CanonicalIVIncrement;
+    TripCount = TC;
+  } else {
+    // When avoiding a runtime check, the active.lane.mask inside the loop
+    // uses a modified trip count and the induction variable increment is
+    // done after the active.lane.mask intrinsic is called.
+    IncrementValue = CanonicalIVPHI;
+    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+                                     {TC}, DL);
+  }
+  auto *EntryIncrement = Builder.createOverflowingOp(
+      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false},
+      DL, "index.part.next");
+
+  // Create the active lane mask instruction in the VPlan preheader.
+  auto *EntryALM =
+      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
+                           DL, "active.lane.mask.entry");
+
+  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+  // preheader ActiveLaneMask instruction.
+  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  LaneMaskPhi->insertAfter(CanonicalIVPHI);
+
+  // Create the active lane mask for the next iteration of the loop before the
+  // original terminator.
+  VPRecipeBase *OriginalTerminator = EB->getTerminator();
+  Builder.setInsertPoint(OriginalTerminator);
+  auto *InLoopIncrement =
+      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+                                  {IncrementValue}, {false, false}, DL);
+  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                   {InLoopIncrement, TripCount}, DL,
+                                   "active.lane.mask.next");
+  LaneMaskPhi->addOperand(ALM);
+
+  // Replace the original terminator with BranchOnCond. We have to invert the
+  // mask here because a true condition means jumping to the exit block.
+  auto *NotMask = Builder.createNot(ALM, DL);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+  OriginalTerminator->eraseFromParent();
+  return LaneMaskPhi;
+}
```
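Per the comment block above, lane `I` of `active-lane-mask %Base, %TC` holds `Base + I < TC`, and the rewritten latch exits once lane 0 of the next-iteration mask goes false. A scalar model of that control flow for the simpler `!DataAndControlFlowWithoutRuntimeCheck` arrangement (illustrative only; real codegen emits `llvm.get.active.lane.mask` and an extract of lane 0):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Model of llvm.get.active.lane.mask(Base, TC): lane I is Base + I < TC.
std::vector<bool> activeLaneMask(uint64_t Base, uint64_t TC, unsigned VF) {
  std::vector<bool> M(VF);
  for (unsigned I = 0; I != VF; ++I)
    M[I] = Base + I < TC;
  return M;
}

int main() {
  const unsigned VF = 4;
  const uint64_t TC = 10; // trip count not a multiple of VF: tail is folded
  uint64_t Index = 0;
  auto Mask = activeLaneMask(Index, TC, VF); // %EntryALM in the preheader

  unsigned ProcessedLanes = 0;
  for (;;) {
    for (unsigned I = 0; I != VF; ++I) // masked vector body
      ProcessedLanes += Mask[I];
    Index += VF;                          // canonical IV increment
    Mask = activeLaneMask(Index, TC, VF); // %ALM for the next iteration
    if (!Mask[0])                         // branch-on-cond (Not %ALM)
      break;
  }
  std::cout << ProcessedLanes << '\n'; // prints 10: exactly TC lanes ran
}
```

Because the last partial mask (`1,1,0,0` here) still enters the body, the tail iterations execute under the mask instead of in a scalar epilogue, which is the point of folding the tail into the vector loop.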
```diff
+void VPlanTransforms::addActiveLaneMask(
+    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
+  assert((!DataAndControlFlowWithoutRuntimeCheck ||
+          UseActiveLaneMaskForControlFlow) &&
+         "DataAndControlFlowWithoutRuntimeCheck implies "
+         "UseActiveLaneMaskForControlFlow");
+
+  auto FoundWidenCanonicalIVUser =
+      find_if(Plan.getCanonicalIV()->users(),
+              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  assert(FoundWidenCanonicalIVUser &&
+         "Must have widened canonical IV when tail folding!");
+  auto *WideCanonicalIV =
+      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+  VPRecipeBase *LaneMask;
+  if (UseActiveLaneMaskForControlFlow) {
+    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
+        Plan, DataAndControlFlowWithoutRuntimeCheck);
+  } else {
+    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                 {WideCanonicalIV, Plan.getTripCount()},
+                                 nullptr, "active.lane.mask");
+    LaneMask->insertAfter(WideCanonicalIV);
+  }
+
+  // Walk users of WideCanonicalIV and replace all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
+  // active-lane-mask.
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+    if (!CompareToReplace ||
+        CompareToReplace->getOpcode() != Instruction::ICmp ||
+        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+        CompareToReplace->getOperand(1) != BTC)
+      continue;
+
+    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+           "WidenCanonicalIV must be the first operand of the compare");
+    CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
+    CompareToReplace->eraseFromParent();
+  }
+}
```
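The compare being replaced evaluates `WideCanonicalIV ule backedge-taken-count` per lane, which matches what `active-lane-mask` computes against the trip count, since `TC = BTC + 1` whenever the trip count doesn't wrap to zero. A quick brute-force check of that per-lane equivalence over small values, purely as an illustration:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // For tail folding, lane L of the widened canonical IV holds IV + L. The
  // guard `lane <= BTC` (what the replaced ICMP_ULE computed) matches
  // `lane < TC` (what active-lane-mask computes) as long as TC = BTC + 1.
  bool AllMatch = true;
  for (uint32_t BTC = 0; BTC < 100; ++BTC) {
    uint32_t TC = BTC + 1;
    for (uint32_t Lane = 0; Lane < 128; ++Lane)
      AllMatch &= (Lane <= BTC) == (Lane < TC);
  }
  std::cout << (AllMatch ? "equivalent" : "mismatch") << '\n'; // equivalent
}
```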
