author     Dimitry Andric <dim@FreeBSD.org>   2023-12-18 20:30:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2024-04-06 20:11:55 +0000
commit     5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree       1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
parent     3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent     312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp')
-rw-r--r-- contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 484
1 file changed, 430 insertions, 54 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 83bfdfd09d19..33132880d5a4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,17 +12,22 @@
//===----------------------------------------------------------------------===//
#include "VPlanTransforms.h"
-#include "VPlanDominatorTree.h"
#include "VPRecipeBuilder.h"
+#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
+
void VPlanTransforms::VPInstructionsToVPRecipes(
VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
@@ -76,7 +81,7 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
} else if (auto *CI = dyn_cast<CastInst>(Inst)) {
NewRecipe = new VPWidenCastRecipe(
- CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI);
+ CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
} else {
NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
}
@@ -158,17 +163,10 @@ static bool sinkScalarOperands(VPlan &Plan) {
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
- for (auto *U : to_vector(SinkCandidate->getVPSingleValue()->users())) {
- auto *UI = cast<VPRecipeBase>(U);
- if (UI->getParent() == SinkTo)
- continue;
-
- for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) {
- if (UI->getOperand(Idx) != SinkCandidate->getVPSingleValue())
- continue;
- UI->setOperand(Idx, Clone);
- }
- }
+ SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
+ Clone, [SinkTo](VPUser &U, unsigned) {
+ return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
+ });
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
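For illustration, a minimal standalone sketch (plain C++ with hypothetical Value/User types, not LLVM's actual classes) of the predicate-driven replaceUsesWithIf idiom that this hunk and the following ones switch to, replacing hand-rolled loops over users and operand indices:

#include <functional>
#include <utility>
#include <vector>

struct User;

// A value tracks its uses as (user, operand-index) pairs.
struct Value {
  std::vector<std::pair<User *, unsigned>> Uses;
  void replaceUsesWithIf(Value *New,
                         const std::function<bool(User &, unsigned)> &Pred);
};

struct User {
  std::vector<Value *> Operands;
  void setOperand(unsigned Idx, Value *V) { Operands[Idx] = V; }
};

void Value::replaceUsesWithIf(
    Value *New, const std::function<bool(User &, unsigned)> &Pred) {
  // Iterate over a snapshot, since rewriting operands edits the use list.
  auto Snapshot = Uses;
  for (auto &[U, Idx] : Snapshot)
    if (Pred(*U, Idx))
      U->setOperand(Idx, New); // a full version would also move the use
                               // record over to New's use list
}

int main() {
  Value Old, Replacement;
  User A, B;
  A.Operands = {&Old};
  B.Operands = {&Old};
  Old.Uses = {{&A, 0}, {&B, 0}};
  // Replace Old only in A, mirroring the "user not in SinkTo" predicate.
  Old.replaceUsesWithIf(&Replacement,
                        [&](User &U, unsigned) { return &U == &A; });
  return (A.Operands[0] == &Replacement && B.Operands[0] == &Old) ? 0 : 1;
}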
@@ -273,16 +271,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
VPValue *PredInst1 =
cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
- for (VPUser *U : to_vector(Phi1ToMoveV->users())) {
- auto *UI = dyn_cast<VPRecipeBase>(U);
- if (!UI || UI->getParent() != Then2)
- continue;
- for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
- if (Phi1ToMoveV != U->getOperand(I))
- continue;
- U->setOperand(I, PredInst1);
- }
- }
+ Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
+ auto *UI = dyn_cast<VPRecipeBase>(&U);
+ return UI && UI->getParent() == Then2;
+ });
Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
}
@@ -479,15 +471,45 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (R.mayHaveSideEffects() || any_of(R.definedValues(), [](VPValue *V) {
- return V->getNumUsers() > 0;
- }))
+ // A user keeps R alive:
+ if (any_of(R.definedValues(),
+ [](VPValue *V) { return V->getNumUsers(); }))
+ continue;
+
+ // Having side effects keeps R alive, but do remove conditional assume
+ // instructions as their conditions may be flattened.
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ bool IsConditionalAssume =
+ RepR && RepR->isPredicated() &&
+ match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+ if (R.mayHaveSideEffects() && !IsConditionalAssume)
continue;
+
R.eraseFromParent();
}
}
}
+static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
+ ScalarEvolution &SE, Instruction *TruncI,
+ Type *IVTy, VPValue *StartV,
+ VPValue *Step) {
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto IP = HeaderVPBB->getFirstNonPhi();
+ VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
+ VPValue *BaseIV = CanonicalIV;
+ if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
+ BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
+ TruncI ? TruncI->getType() : nullptr);
+ HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+ }
+
+ VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
+ HeaderVPBB->insert(Steps, IP);
+ return Steps;
+}
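For context, ignoring interleaving and non-integer inductions, the recipes built by this helper amount to the following per-lane arithmetic (a standalone scalar model in plain C++ with hypothetical names, not the actual recipe implementation):

#include <cstdint>
#include <cstdio>

// What VPDerivedIVRecipe materializes from the canonical IV.
int64_t derivedIV(int64_t Start, int64_t Step, int64_t CanonicalIV) {
  return Start + CanonicalIV * Step;
}

int main() {
  const int64_t Start = 100, Step = 3, VF = 4;
  for (int64_t Iter = 0; Iter < 2; ++Iter) {
    int64_t CanIV = Iter * VF; // the canonical IV advances by VF each time
    int64_t Base = derivedIV(Start, Step, CanIV);
    // VPScalarIVStepsRecipe then adds the per-lane offset Lane * Step.
    for (int64_t Lane = 0; Lane < VF; ++Lane)
      std::printf("iter %lld, lane %lld -> %lld\n", (long long)Iter,
                  (long long)Lane, (long long)(Base + Lane * Step));
  }
}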
+
void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
SmallVector<VPRecipeBase *> ToRemove;
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
@@ -501,36 +523,18 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
}))
continue;
- auto IP = HeaderVPBB->getFirstNonPhi();
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
- Type *ResultTy = WideIV->getPHINode()->getType();
- if (Instruction *TruncI = WideIV->getTruncInst())
- ResultTy = TruncI->getType();
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *Step = WideIV->getStepValue();
- VPValue *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(ID.getKind(), WideIV->getStartValue(), Step,
- ResultTy)) {
- BaseIV = new VPDerivedIVRecipe(ID, WideIV->getStartValue(), CanonicalIV,
- Step, ResultTy);
- HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
- }
-
- VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
- HeaderVPBB->insert(Steps, IP);
+ VPValue *Steps = createScalarIVSteps(
+ Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
+ WideIV->getStartValue(), WideIV->getStepValue());
- // Update scalar users of IV to use Step instead. Use SetVector to ensure
- // the list of users doesn't contain duplicates.
- SetVector<VPUser *> Users(WideIV->user_begin(), WideIV->user_end());
- for (VPUser *U : Users) {
- if (HasOnlyVectorVFs && !U->usesScalars(WideIV))
- continue;
- for (unsigned I = 0, E = U->getNumOperands(); I != E; I++) {
- if (U->getOperand(I) != WideIV)
- continue;
- U->setOperand(I, Steps);
- }
- }
+ // Update scalar users of IV to use Step instead.
+ if (!HasOnlyVectorVFs)
+ WideIV->replaceAllUsesWith(Steps);
+ else
+ WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
+ return U.usesScalars(WideIV);
+ });
}
}
@@ -778,3 +782,375 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}
}
+
+/// Returns true if \p V is a constant one.
+static bool isConstantOne(VPValue *V) {
+ if (!V->isLiveIn())
+ return false;
+ auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
+ return C && C->isOne();
+}
+
+/// Returns the llvm::Instruction opcode for \p R.
+static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
+ if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
+ return WidenR->getUnderlyingInstr()->getOpcode();
+ if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
+ return WidenC->getOpcode();
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
+ return RepR->getUnderlyingInstr()->getOpcode();
+ if (auto *VPI = dyn_cast<VPInstruction>(&R))
+ return VPI->getOpcode();
+ return 0;
+}
+
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+ switch (getOpcodeForRecipe(R)) {
+ case Instruction::Mul: {
+ VPValue *A = R.getOperand(0);
+ VPValue *B = R.getOperand(1);
+ if (isConstantOne(A))
+ return R.getVPSingleValue()->replaceAllUsesWith(B);
+ if (isConstantOne(B))
+ return R.getVPSingleValue()->replaceAllUsesWith(A);
+ break;
+ }
+ case Instruction::Trunc: {
+ VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
+ if (!Ext)
+ break;
+ unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ break;
+ VPValue *A = Ext->getOperand(0);
+ VPValue *Trunc = R.getVPSingleValue();
+ Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+ Type *ATy = TypeInfo.inferScalarType(A);
+ if (TruncTy == ATy) {
+ Trunc->replaceAllUsesWith(A);
+ } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
+#ifndef NDEBUG
+  // Verify that the cached type info for both A and its users is still
+  // accurate by comparing it to freshly computed types.
+ VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
+ assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+ for (VPUser *U : A->users()) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ if (!R)
+ continue;
+ for (VPValue *VPV : R->definedValues())
+ assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+ }
+#endif
+ break;
+ }
+ default:
+ break;
+ }
+}
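The Instruction::Trunc case above folds trunc(ext(A)) by comparing scalar bit widths; a standalone sketch of the three outcomes (plain C++, hypothetical helper name):

#include <cassert>

enum class Cast { None, SameExt, Trunc };

// Folding trunc(zext/sext A), given A's width and the trunc's target width.
Cast foldTruncOfExt(unsigned SrcBits, unsigned DestBits) {
  if (DestBits == SrcBits)
    return Cast::None;    // trunc(ext A) -> A
  if (DestBits > SrcBits)
    return Cast::SameExt; // trunc(ext A) -> narrower ext of A
  return Cast::Trunc;     // trunc(ext A) -> trunc A
}

int main() {
  assert(foldTruncOfExt(8, 8) == Cast::None);     // (i8 -> i32) -> i8
  assert(foldTruncOfExt(8, 16) == Cast::SameExt); // (i8 -> i32) -> i16
  assert(foldTruncOfExt(16, 8) == Cast::Trunc);   // (i16 -> i64) -> i8
}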
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ VPTypeAnalysis TypeInfo(Ctx);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ simplifyRecipe(R, TypeInfo);
+ }
+ }
+}
+
+void VPlanTransforms::truncateToMinimalBitwidths(
+ VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+ LLVMContext &Ctx) {
+#ifndef NDEBUG
+ // Count the processed recipes and cross check the count later with MinBWs
+ // size, to make sure all entries in MinBWs have been handled.
+ unsigned NumProcessedRecipes = 0;
+#endif
+ // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make
+ // other uses have different types for their operands, making them invalidly
+ // typed.
+ DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+ VPTypeAnalysis TypeInfo(Ctx);
+ VPBasicBlock *PH = Plan.getEntry();
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+ VPWidenSelectRecipe>(&R))
+ continue;
+
+ VPValue *ResultVPV = R.getVPSingleValue();
+ auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+ unsigned NewResSizeInBits = MinBWs.lookup(UI);
+ if (!NewResSizeInBits)
+ continue;
+
+#ifndef NDEBUG
+ NumProcessedRecipes++;
+#endif
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. Skip those here, after incrementing NumProcessedRecipes. Also
+ // skip casts which do not need to be handled explicitly here, as
+ // redundant casts will be removed during recipe simplification.
+ if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
+#ifndef NDEBUG
+      // If any of the operands is a live-in that is not used by a
+      // VPWidenRecipe or VPWidenSelectRecipe but is in MinBWs, make sure it
+      // is counted as processed as well. As MinBWs is currently constructed,
+      // there is no information about whether recipes are widened or
+      // replicated, and in case they are replicated the operands are not
+      // truncated. Counting them here ensures we do not miss any recipes in
+      // MinBWs.
+ // TODO: Remove once the analysis is done on VPlan.
+ for (VPValue *Op : R.operands()) {
+ if (!Op->isLiveIn())
+ continue;
+ auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
+ if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
+ all_of(Op->users(), [](VPUser *U) {
+ return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
+ })) {
+ // Add an entry to ProcessedTruncs to avoid counting the same
+ // operand multiple times.
+ ProcessedTruncs[Op] = nullptr;
+ NumProcessedRecipes += 1;
+ }
+ }
+#endif
+ continue;
+ }
+
+ Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+ unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+ assert(OldResTy->isIntegerTy() && "only integer types supported");
+ if (OldResSizeInBits == NewResSizeInBits)
+ continue;
+ assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+ (void)OldResSizeInBits;
+
+ auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+ // Shrink operands by introducing truncates as needed.
+ unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+ for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+ auto *Op = R.getOperand(Idx);
+ unsigned OpSizeInBits =
+ TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+ if (OpSizeInBits == NewResSizeInBits)
+ continue;
+ assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+ auto [ProcessedIter, IterIsEmpty] =
+ ProcessedTruncs.insert({Op, nullptr});
+ VPWidenCastRecipe *NewOp =
+ IterIsEmpty
+ ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+ : ProcessedIter->second;
+ R.setOperand(Idx, NewOp);
+ if (!IterIsEmpty)
+ continue;
+ ProcessedIter->second = NewOp;
+ if (!Op->isLiveIn()) {
+ NewOp->insertBefore(&R);
+ } else {
+ PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+ auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+ bool IsContained = MinBWs.contains(OpInst);
+ NumProcessedRecipes += IsContained;
+#endif
+ }
+ }
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to VPW.
+ if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+ VPW->dropPoisonGeneratingFlags();
+
+ // Extend result to original width.
+ auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+ Ext->insertAfter(&R);
+ ResultVPV->replaceAllUsesWith(Ext);
+ Ext->setOperand(0, ResultVPV);
+ }
+ }
+
+ assert(MinBWs.size() == NumProcessedRecipes &&
+ "some entries in MinBWs haven't been processed");
+}
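The intent of the transformation, as a standalone check in plain C++ (an analogy, not the pass itself): once analysis proves an operation's result needs only N bits, the operands can be truncated, the operation performed at N bits, and the result zero-extended back, without changing the observable value.

#include <cassert>
#include <cstdint>

uint32_t wideAdd(uint32_t A, uint32_t B) { return A + B; }

// The shrunken form: trunc both operands to i8, add at the minimal width,
// then zext the result back to the original i32 type.
uint32_t shrunkenAdd(uint32_t A, uint32_t B) {
  uint8_t Narrow = static_cast<uint8_t>(A) + static_cast<uint8_t>(B);
  // Wrapping at the narrow width must not be UB, which is why the code
  // above drops poison-generating (nuw/nsw) flags from shrunken recipes.
  return Narrow; // implicit zext back to 32 bits
}

int main() {
  // Equal whenever the analysis proved the result fits in 8 bits.
  for (uint32_t A = 0; A < 128; ++A)
    for (uint32_t B = 0; B < 128; ++B)
      assert(wideAdd(A, B) == shrunkenAdd(A, B));
}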
+
+void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
+ removeRedundantCanonicalIVs(Plan);
+ removeRedundantInductionCasts(Plan);
+
+ optimizeInductions(Plan, SE);
+ simplifyRecipes(Plan, SE.getContext());
+ removeDeadRecipes(Plan);
+
+ createAndOptimizeReplicateRegions(Plan);
+
+ removeRedundantExpandSCEVRecipes(Plan);
+ mergeBlocksIntoPredecessors(Plan);
+}
+
+// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
+// the loop terminator with a branch-on-cond recipe with the negated
+// active-lane-mask as operand. Note that this turns the loop into an
+// uncountable one. Only the existing terminator is replaced, all other existing
+// recipes/users remain unchanged, except for poison-generating flags being
+// dropped from the canonical IV increment. Return the created
+// VPActiveLaneMaskPHIRecipe.
+//
+// The function uses the following definitions:
+//
+// %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
+// calculate-trip-count-minus-VF (original TC) : original TC
+// %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
+// CanonicalIVPhi : CanonicalIVIncrement
+// %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+// %TripCount = calculate-trip-count-minus-VF (original TC)
+// [if DataAndControlFlowWithoutRuntimeCheck]
+// %EntryInc = canonical-iv-increment-for-part %StartV
+// %EntryALM = active-lane-mask %EntryInc, %TripCount
+//
+// vector.body:
+// ...
+// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
+// ...
+// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+// %ALM = active-lane-mask %InLoopInc, %TripCount
+// %Negated = Not %ALM
+// branch-on-cond %Negated
+//
+static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
+ VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ // TODO: Check if dropping the flags is needed if
+ // !DataAndControlFlowWithoutRuntimeCheck.
+ CanonicalIVIncrement->dropPoisonGeneratingFlags();
+ DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+ // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+ // we have to take unrolling into account. Each part needs to start at
+  // Part * VF.
+ auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+ VPBuilder Builder(VecPreheader);
+
+ // Create the ActiveLaneMask instruction using the correct start values.
+ VPValue *TC = Plan.getTripCount();
+
+ VPValue *TripCount, *IncrementValue;
+ if (!DataAndControlFlowWithoutRuntimeCheck) {
+ // When the loop is guarded by a runtime overflow check for the loop
+ // induction variable increment by VF, we can increment the value before
+    // the get.active.lane.mask intrinsic and use the unmodified trip count.
+ IncrementValue = CanonicalIVIncrement;
+ TripCount = TC;
+ } else {
+ // When avoiding a runtime check, the active.lane.mask inside the loop
+ // uses a modified trip count and the induction variable increment is
+ // done after the active.lane.mask intrinsic is called.
+ IncrementValue = CanonicalIVPHI;
+ TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+ {TC}, DL);
+ }
+ auto *EntryIncrement = Builder.createOverflowingOp(
+ VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
+ "index.part.next");
+
+ // Create the active lane mask instruction in the VPlan preheader.
+ auto *EntryALM =
+ Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
+ DL, "active.lane.mask.entry");
+
+ // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+ // preheader ActiveLaneMask instruction.
+ auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ LaneMaskPhi->insertAfter(CanonicalIVPHI);
+
+ // Create the active lane mask for the next iteration of the loop before the
+ // original terminator.
+ VPRecipeBase *OriginalTerminator = EB->getTerminator();
+ Builder.setInsertPoint(OriginalTerminator);
+ auto *InLoopIncrement =
+ Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+ {IncrementValue}, {false, false}, DL);
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {InLoopIncrement, TripCount}, DL,
+ "active.lane.mask.next");
+ LaneMaskPhi->addOperand(ALM);
+
+ // Replace the original terminator with BranchOnCond. We have to invert the
+ // mask here because a true condition means jumping to the exit block.
+ auto *NotMask = Builder.createNot(ALM, DL);
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+ OriginalTerminator->eraseFromParent();
+ return LaneMaskPhi;
+}
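A standalone scalar model (plain C++) of what the recipes created above compute at runtime, for the variant that keeps the runtime overflow check (increment before the mask, unmodified trip count): the active-lane-mask yields, per lane, the predicate base + lane < trip count, and the loop exits once the mask for the next iteration is all-false.

#include <array>
#include <cstdint>

constexpr unsigned VF = 4;

// Per-lane predicate (Base + Lane) < TripCount, as in
// llvm.get.active.lane.mask (unsigned comparison).
std::array<bool, VF> activeLaneMask(uint64_t Base, uint64_t TripCount) {
  std::array<bool, VF> Mask{};
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Mask[Lane] = Base + Lane < TripCount;
  return Mask;
}

bool anyActive(const std::array<bool, VF> &Mask) {
  for (bool B : Mask)
    if (B)
      return true;
  return false;
}

int main() {
  const uint64_t TC = 10; // not a multiple of VF: the tail stays masked
  auto Mask = activeLaneMask(0, TC); // %EntryALM, built in the preheader
  for (uint64_t IV = 0; anyActive(Mask); IV += VF) {
    // ... loop body, each lane predicated on Mask[Lane] ...
    Mask = activeLaneMask(IV + VF, TC); // %ALM for the next iteration;
                                        // IV + VF not wrapping is what the
                                        // runtime overflow check guarantees
  }
}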
+
+void VPlanTransforms::addActiveLaneMask(
+ VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
+ bool DataAndControlFlowWithoutRuntimeCheck) {
+ assert((!DataAndControlFlowWithoutRuntimeCheck ||
+ UseActiveLaneMaskForControlFlow) &&
+ "DataAndControlFlowWithoutRuntimeCheck implies "
+ "UseActiveLaneMaskForControlFlow");
+
+ auto FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ assert(FoundWidenCanonicalIVUser &&
+ "Must have widened canonical IV when tail folding!");
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ VPRecipeBase *LaneMask;
+ if (UseActiveLaneMaskForControlFlow) {
+ LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
+ Plan, DataAndControlFlowWithoutRuntimeCheck);
+ } else {
+ LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount()},
+ nullptr, "active.lane.mask");
+ LaneMask->insertAfter(WideCanonicalIV);
+ }
+
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
+ // active-lane-mask.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
+ CompareToReplace->eraseFromParent();
+ }
+}
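Why the replacement at the end is sound, as a standalone check (plain C++): with the backedge-taken count BTC = TC - 1, the per-lane predicate IV <= BTC computed by the compare is exactly IV < TC, which is what the active-lane-mask computes.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t TC = 10, BTC = TC - 1; // backedge-taken count
  for (uint64_t IV = 0; IV < 2 * TC; ++IV)
    assert((IV <= BTC) == (IV < TC)); // ICMP_ULE vs. active-lane-mask lane
  return 0;
}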