Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 484
1 file changed, 430 insertions, 54 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 83bfdfd09d19..33132880d5a4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,17 +12,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
-#include "VPlanDominatorTree.h"
 #include "VPRecipeBuilder.h"
+#include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
+
 void VPlanTransforms::VPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
@@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
           NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
         } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
           NewRecipe = new VPWidenCastRecipe(
-              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI);
+              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
         } else {
           NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
         }
@@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) {
       // TODO: add ".cloned" suffix to name of Clone's VPValue.
 
       Clone->insertBefore(SinkCandidate);
-      for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) {
-        auto *UI = cast<VPRecipeBase>(U);
-        if (UI->getParent() == SinkTo)
-          continue;
-
-        for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) {
-          if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue())
-            continue;
-          UI->setOperand(Idx, Clone);
-        }
-      }
+      SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
+          Clone, [SinkTo](VPUser &U, unsigned) {
+            return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
+          });
     }
     SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
     for (VPValue *Op : SinkCandidate->operands())
@@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
       VPValue *PredInst1 =
           cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
       VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
-      for (VPUser *U : to_vector(Phi1ToMoveV->users())) {
-        auto *UI = dyn_cast<VPRecipeBase>(U);
-        if (!UI || UI->getParent() != Then2)
-          continue;
-        for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
-          if (Phi1ToMoveV != U->getOperand(I))
-            continue;
-          U->setOperand(I, PredInst1);
-        }
-      }
+      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
+        auto *UI = dyn_cast<VPRecipeBase>(&U);
+        return UI && UI->getParent() == Then2;
+      });
 
       Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
     }
@@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
     // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
-            return V->getNumUsers() > 0;
-          }))
+      // A user keeps R alive:
+      if (any_of(R.definedValues(),
+                 [](VPValue *V) { return V->getNumUsers(); }))
+        continue;
+
+      // Having side effects keeps R alive, but do remove conditional assume
+      // instructions as their conditions may be flattened.
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      bool IsConditionalAssume =
+          RepR && RepR->isPredicated() &&
+          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+      if (R.mayHaveSideEffects() && !IsConditionalAssume)
         continue;
+
       R.eraseFromParent();
     }
   }
 }
 
+static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
+                                    ScalarEvolution &SE, Instruction *TruncI,
+                                    Type *IVTy, VPValue *StartV,
+                                    VPValue *Step) {
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  auto IP = HeaderVPBB->getFirstNonPhi();
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
+  VPValue *BaseIV = CanonicalIV;
+  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
+    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
+                                   TruncI ? TruncI->getType() : nullptr);
+    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+  }
+
+  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
+  HeaderVPBB->insert(Steps, IP);
+  return Steps;
+}
+
 void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
   SmallVector<VPRecipeBase *> ToRemove;
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
@@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
         }))
       continue;
 
-    auto IP = HeaderVPBB->getFirstNonPhi();
-    VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
-    Type *ResultTy = WideIV->getPHINode()->getType();
-    if (Instruction *TruncI = WideIV->getTruncInst())
-      ResultTy = TruncI->getType();
     const InductionDescriptor &ID = WideIV->getInductionDescriptor();
-    VPValue *Step = WideIV->getStepValue();
-    VPValue *BaseIV = CanonicalIV;
-    if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step,
-                                  ResultTy)) {
-      BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV,
-                                     Step, ResultTy);
-      HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
-    }
-
-    VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
-    HeaderVPBB->insert(Steps, IP);
+    VPValue *Steps = createScalarIVSteps(
+        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
+        WideIV->getStartValue(), WideIV->getStepValue());
 
-    // Update scalar users of IV to use Step instead. Use SetVector to ensure
-    // the list of users doesn't contain duplicates.
-    SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end());
-    for (VPUser *U : Users) {
-      if (HasOnlyVectorVFs && !U->usesScalars(WideIV))
-        continue;
-      for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) {
-        if (U->getOperand(I) != WideIV)
-          continue;
-        U->setOperand(I, Steps);
-      }
-    }
+    // Update scalar users of IV to use Step instead.
+    if (!HasOnlyVectorVFs)
+      WideIV->replaceAllUsesWith(Steps);
+    else
+      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
+        return U.usesScalars(WideIV);
+      });
   }
 }
 
@@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
     }
   }
 }
+
+/// Returns true if \p V is a constant one.
+static bool isConstantOne(VPValue *V) {
+  if (!V->isLiveIn())
+    return false;
+  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
+  return C && C->isOne();
+}
+
+/// Returns the llvm::Instruction opcode for \p R.
+static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
+  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
+    return WidenR->getUnderlyingInstr()->getOpcode();
+  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
+    return WidenC->getOpcode();
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
+    return RepR->getUnderlyingInstr()->getOpcode();
+  if (auto *VPI = dyn_cast<VPInstruction>(&R))
+    return VPI->getOpcode();
+  return 0;
+}
+
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+  switch (getOpcodeForRecipe(R)) {
+  case Instruction::Mul: {
+    VPValue *A = R.getOperand(0);
+    VPValue *B = R.getOperand(1);
+    if (isConstantOne(A))
+      return R.getVPSingleValue()->replaceAllUsesWith(B);
+    if (isConstantOne(B))
+      return R.getVPSingleValue()->replaceAllUsesWith(A);
+    break;
+  }
+  case Instruction::Trunc: {
+    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
+    if (!Ext)
+      break;
+    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
+    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+      break;
+    VPValue *A = Ext->getOperand(0);
+    VPValue *Trunc = R.getVPSingleValue();
+    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+    Type *ATy = TypeInfo.inferScalarType(A);
+    if (TruncTy == ATy) {
+      Trunc->replaceAllUsesWith(A);
+    } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+      auto *VPC =
+          new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+      VPC->insertBefore(&R);
+      Trunc->replaceAllUsesWith(VPC);
+    } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+      auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+      VPC->insertBefore(&R);
+      Trunc->replaceAllUsesWith(VPC);
+    }
+#ifndef NDEBUG
+    // Verify that the cached type info for both A and its users is still
+    // accurate by comparing it to freshly computed types.
+    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
+    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+    for (VPUser *U : A->users()) {
+      auto *R = dyn_cast<VPRecipeBase>(U);
+      if (!R)
+        continue;
+      for (VPValue *VPV : R->definedValues())
+        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+    }
+#endif
+    break;
+  }
+  default:
+    break;
+  }
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  VPTypeAnalysis TypeInfo(Ctx);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      simplifyRecipe(R, TypeInfo);
+    }
+  }
+}
+
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+    LLVMContext &Ctx) {
+#ifndef NDEBUG
+  // Count the processed recipes and cross check the count later with MinBWs
+  // size, to make sure all entries in MinBWs have been handled.
+  unsigned NumProcessedRecipes = 0;
+#endif
+  // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make other
+  // uses have different types for their operands, making them invalidly
+  // typed.
+  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+  VPTypeAnalysis TypeInfo(Ctx);
+  VPBasicBlock *PH = Plan.getEntry();
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      NumProcessedRecipes++;
+#endif
+      // If the value wasn't vectorized, we must maintain the original scalar
+      // type. Skip those here, after incrementing NumProcessedRecipes. Also
+      // skip casts which do not need to be handled explicitly here, as
+      // redundant casts will be removed during recipe simplification.
+      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
+#ifndef NDEBUG
+        // If any of the operands is a live-in and not used by VPWidenRecipe or
+        // VPWidenSelectRecipe, but is in MinBWs, make sure it is counted as
+        // processed as well. When MinBWs is constructed, there is currently no
+        // information about whether recipes are widened or replicated, and in
+        // case they are replicated the operands are not truncated. Counting
+        // them here ensures we do not miss any recipes in MinBWs.
+        // TODO: Remove once the analysis is done on VPlan.
+        for (VPValue *Op : R.operands()) {
+          if (!Op->isLiveIn())
+            continue;
+          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
+          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
+              all_of(Op->users(), [](VPUser *U) {
+                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
+              })) {
+            // Add an entry to ProcessedTruncs to avoid counting the same
+            // operand multiple times.
+            ProcessedTruncs[Op] = nullptr;
+            NumProcessedRecipes += 1;
+          }
+        }
+#endif
+        continue;
+      }
+
+      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+      assert(OldResTy->isIntegerTy() && "only integer types supported");
+      if (OldResSizeInBits == NewResSizeInBits)
+        continue;
+      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+      (void)OldResSizeInBits;
+
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits =
+            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto [ProcessedIter, IterIsEmpty] =
+            ProcessedTruncs.insert({Op, nullptr});
+        VPWidenCastRecipe *NewOp =
+            IterIsEmpty
+                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+                : ProcessedIter->second;
+        R.setOperand(Idx, NewOp);
+        if (!IterIsEmpty)
+          continue;
+        ProcessedIter->second = NewOp;
+        if (!Op->isLiveIn()) {
+          NewOp->insertBefore(&R);
+        } else {
+          PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+          bool IsContained = MinBWs.contains(OpInst);
+          NumProcessedRecipes += IsContained;
+#endif
+        }
+      }
+
+      // Any wrapping introduced by shrinking this operation shouldn't be
+      // considered undefined behavior. So, we can't unconditionally copy
+      // arithmetic wrapping flags to VPW.
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == NumProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
+void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+  removeRedundantCanonicalIVs(Plan);
+  removeRedundantInductionCasts(Plan);
+
+  optimizeInductions(Plan, SE);
+  simplifyRecipes(Plan, SE.getContext());
+  removeDeadRecipes(Plan);
+
+  createAndOptimizeReplicateRegions(Plan);
+
+  removeRedundantExpandSCEVRecipes(Plan);
+  mergeBlocksIntoPredecessors(Plan);
+}
+
+// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
+// the loop terminator with a branch-on-cond recipe with the negated
+// active-lane-mask as operand. Note that this turns the loop into an
+// uncountable one. Only the existing terminator is replaced; all other
+// existing recipes/users remain unchanged, except for poison-generating flags
+// being dropped from the canonical IV increment. Return the created
+// VPActiveLaneMaskPHIRecipe.
+//
+// The function uses the following definitions:
+//
+//  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
+//    calculate-trip-count-minus-VF (original TC) : original TC
+//  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
+//     CanonicalIVPhi : CanonicalIVIncrement
+//  %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+//   %TripCount = calculate-trip-count-minus-VF (original TC)
+//       [if DataAndControlFlowWithoutRuntimeCheck]
+//   %EntryInc = canonical-iv-increment-for-part %StartV
+//   %EntryALM = active-lane-mask %EntryInc, %TripCount
+//
+// vector.body:
+//   ...
+//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
+//   ...
+//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+//   %ALM = active-lane-mask %InLoopInc, TripCount
+//   %Negated = Not %ALM
+//   branch-on-cond %Negated
+//
+static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
+    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
+  auto *CanonicalIVPHI = Plan.getCanonicalIV();
+  VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+  auto *CanonicalIVIncrement =
+      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+  // TODO: Check if dropping the flags is needed if
+  // !DataAndControlFlowWithoutRuntimeCheck.
+  CanonicalIVIncrement->dropPoisonGeneratingFlags();
+  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+  // we have to take unrolling into account. Each part needs to start at
+  //   Part * VF.
+  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+  VPBuilder Builder(VecPreheader);
+
+  // Create the ActiveLaneMask instruction using the correct start values.
+  VPValue *TC = Plan.getTripCount();
+
+  VPValue *TripCount, *IncrementValue;
+  if (!DataAndControlFlowWithoutRuntimeCheck) {
+    // When the loop is guarded by a runtime overflow check for the loop
+    // induction variable increment by VF, we can increment the value before
+    // the get.active.lane.mask and use the unmodified tripcount.
+    IncrementValue = CanonicalIVIncrement;
+    TripCount = TC;
+  } else {
+    // When avoiding a runtime check, the active.lane.mask inside the loop
+    // uses a modified trip count and the induction variable increment is
+    // done after the active.lane.mask intrinsic is called.
+    IncrementValue = CanonicalIVPHI;
+    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+                                     {TC}, DL);
+  }
+  auto *EntryIncrement = Builder.createOverflowingOp(
+      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
+      "index.part.next");
+
+  // Create the active lane mask instruction in the VPlan preheader.
+  auto *EntryALM =
+      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
+                           DL, "active.lane.mask.entry");
+
+  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+  // preheader ActiveLaneMask instruction.
+  auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  LaneMaskPhi->insertAfter(CanonicalIVPHI);
+
+  // Create the active lane mask for the next iteration of the loop before the
+  // original terminator.
+  VPRecipeBase *OriginalTerminator = EB->getTerminator();
+  Builder.setInsertPoint(OriginalTerminator);
+  auto *InLoopIncrement =
+      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+                                  {IncrementValue}, {false, false}, DL);
+  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                   {InLoopIncrement, TripCount}, DL,
+                                   "active.lane.mask.next");
+  LaneMaskPhi->addOperand(ALM);
+
+  // Replace the original terminator with BranchOnCond. We have to invert the
+  // mask here because a true condition means jumping to the exit block.
+  auto *NotMask = Builder.createNot(ALM, DL);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+  OriginalTerminator->eraseFromParent();
+  return LaneMaskPhi;
+}
+
+void VPlanTransforms::addActiveLaneMask(
+    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
+  assert((!DataAndControlFlowWithoutRuntimeCheck ||
+          UseActiveLaneMaskForControlFlow) &&
+         "DataAndControlFlowWithoutRuntimeCheck implies "
+         "UseActiveLaneMaskForControlFlow");
+
+  auto FoundWidenCanonicalIVUser =
+      find_if(Plan.getCanonicalIV()->users(),
+              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  assert(FoundWidenCanonicalIVUser &&
+         "Must have widened canonical IV when tail folding!");
+  auto *WideCanonicalIV =
+      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+  VPRecipeBase *LaneMask;
+  if (UseActiveLaneMaskForControlFlow) {
+    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
+        Plan, DataAndControlFlowWithoutRuntimeCheck);
+  } else {
+    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                 {WideCanonicalIV, Plan.getTripCount()},
+                                 nullptr, "active.lane.mask");
+    LaneMask->insertAfter(WideCanonicalIV);
+  }
+
+  // Walk users of WideCanonicalIV and replace all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
+  // active-lane-mask.
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+    if (!CompareToReplace ||
+        CompareToReplace->getOpcode() != Instruction::ICmp ||
+        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+        CompareToReplace->getOperand(1) != BTC)
+      continue;
+
+    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+           "WidenCanonicalIV must be the first operand of the compare");
+    CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
+    CompareToReplace->eraseFromParent();
+  }
+}
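Notes on selected hunks follow. The code blocks below are illustrative sketches written for this page, not code from the patch or from LLVM, and any helper names they introduce are invented.

The first three rewritten hunks (in sinkScalarOperands, mergeReplicateRegionsIntoSuccessors and optimizeInductions) swap hand-rolled operand-rewriting loops for VPValue::replaceUsesWithIf. A minimal sketch of the contract those call sites rely on, assuming the signature used above (the actual implementation lives in VPlan's sources):

    // Sketch only: rewrite every use of this value for which ShouldReplace
    // returns true; the predicate sees the user and the operand index.
    void VPValue::replaceUsesWithIf(
        VPValue *New,
        llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace) {
      // Snapshot the users first: setOperand may drop the current user from
      // this value's user list while iterating.
      SmallVector<VPUser *> UsersCopy(user_begin(), user_end());
      for (VPUser *User : UsersCopy)
        for (unsigned I = 0, E = User->getNumOperands(); I != E; ++I)
          if (User->getOperand(I) == this && ShouldReplace(*User, I))
            User->setOperand(I, New);
    }

The optimizeInductions hunk combines both forms: replaceAllUsesWith handles the unconditional case, and the predicate form keeps vector users intact when only scalar users must switch to the computed steps.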
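The (trunc (ext A)) folding in simplifyRecipe distinguishes three cases by scalar bit width. A standalone restatement of that case analysis (names invented for this sketch):

    // Classify trunc(ext A) by comparing A's width with the trunc's width.
    enum class Fold { UseAOutright, ReExtendA, ReTruncateA };

    Fold classifyTruncOfExt(unsigned ABits, unsigned TruncBits) {
      if (ABits == TruncBits)
        return Fold::UseAOutright; // trunc(ext A) == A, no cast needed
      if (ABits < TruncBits)
        return Fold::ReExtendA;    // a single ext of A to the trunc type
      return Fold::ReTruncateA;    // a single trunc of A to the trunc type
    }

In the second case the replacement reuses the original extension's opcode (zext or sext), since the value still has to be extended; only in the third case is a fresh trunc emitted.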
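In truncateToMinimalBitwidths, each selected recipe is narrowed to its MinBWs width and the result is zero-extended back to the original type; MinBWs is computed so that users only depend on the low bits. The dropPoisonGeneratingFlags() call is needed because an operation that cannot wrap at the original width may well wrap at the narrow width, as this small standalone demonstration with made-up values shows:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t A = 200, B = 2;
      int32_t Wide = A * B;            // 400: no wrap in 32 bits
      uint8_t Narrow = uint8_t(A * B); // 400 truncated to 8 bits: 144
      // A mul that carried nsw/nuw at 32 bits wraps once shrunk to 8 bits,
      // so the narrowed recipe must not keep its no-wrap flags.
      printf("wide=%d narrow=%u\n", Wide, (unsigned)Narrow);
      return 0;
    }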
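The active-lane-mask recipes created by addVPLaneMaskPhiAndUpdateExitBranch follow the semantics of llvm.get.active.lane.mask: lane I of the mask is active iff Base + I is less than the trip count in an unsigned compare. A scalar model of that computation (helper name invented):

    #include <cstdint>
    #include <vector>

    // Lane I is active iff Base + I < TripCount (unsigned compare).
    std::vector<bool> activeLaneMask(uint64_t Base, uint64_t TripCount,
                                     unsigned VF) {
      std::vector<bool> Mask(VF);
      for (unsigned I = 0; I < VF; ++I)
        Mask[I] = Base + I < TripCount;
      return Mask;
    }

With TripCount = 10 and VF = 4, the final iteration runs at Base = 8 and produces {1, 1, 0, 0}. Since the mask is monotone (a later lane can only be active if all earlier lanes are), the branch-on-cond of the negated mask exits the loop once the first lane, and therefore every lane, of the next mask is false.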
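Finally, createScalarIVSteps materializes non-canonical inductions from the canonical one: when the widened IV is not itself canonical, a derived-IV recipe computes Start + CanonicalIV * Step, and the scalar-steps recipe then produces the per-lane values. In scalar form, for integer inductions (a sketch; names invented):

    #include <cstdint>

    int64_t derivedIVAt(int64_t Start, int64_t Step, int64_t CanonicalIV) {
      return Start + CanonicalIV * Step;       // value of the derived IV
    }

    int64_t scalarStepAt(int64_t DerivedIV, int64_t Step, unsigned Lane) {
      return DerivedIV + int64_t(Lane) * Step; // per-lane scalar step
    }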
