aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp')
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp571
1 files changed, 444 insertions, 127 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 26c309eed800..c23428e2ba34 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "VPlan.h"
+#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
@@ -114,6 +115,16 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPPredInstPHISC:
return false;
+ case VPInstructionSC:
+ switch (cast<VPInstruction>(this)->getOpcode()) {
+ case Instruction::ICmp:
+ case VPInstruction::Not:
+ case VPInstruction::CalculateTripCountMinusVF:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ return false;
+ default:
+ return true;
+ }
case VPWidenCallSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
->mayHaveSideEffects();
@@ -156,8 +167,13 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
if (vputils::isUniformAfterVectorization(ExitValue))
Lane = VPLane::getFirstLane();
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ assert(MiddleVPBB->getNumSuccessors() == 0 &&
+ "the middle block must not have any successors");
+ BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
- State.Builder.GetInsertBlock());
+ MiddleBB);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -216,15 +232,55 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
+ assert(OpType == OperationType::FPMathOp &&
+ "recipe doesn't have fast math flags");
+ FastMathFlags Res;
+ Res.setAllowReassoc(FMFs.AllowReassoc);
+ Res.setNoNaNs(FMFs.NoNaNs);
+ Res.setNoInfs(FMFs.NoInfs);
+ Res.setNoSignedZeros(FMFs.NoSignedZeros);
+ Res.setAllowReciprocal(FMFs.AllowReciprocal);
+ Res.setAllowContract(FMFs.AllowContract);
+ Res.setApproxFunc(FMFs.ApproxFunc);
+ return Res;
+}
+
+VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
+ VPValue *A, VPValue *B, DebugLoc DL,
+ const Twine &Name)
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
+ Pred, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {
+ assert(Opcode == Instruction::ICmp &&
+ "only ICmp predicates supported at the moment");
+}
+
+VPInstruction::VPInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
+ VPValue(this), Opcode(Opcode), Name(Name.str()) {
+ // Make sure the VPInstruction is a floating-point operation.
+ assert(isFPMathOp() && "this op can't take fast-math flags");
+}
+
Value *VPInstruction::generateInstruction(VPTransformState &State,
unsigned Part) {
IRBuilderBase &Builder = State.Builder;
- Builder.SetCurrentDebugLocation(DL);
+ Builder.SetCurrentDebugLocation(getDebugLoc());
if (Instruction::isBinaryOp(getOpcode())) {
+ if (Part != 0 && vputils::onlyFirstPartUsed(this))
+ return State.get(this, 0);
+
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
- return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
+ auto *Res =
+ Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
+ if (auto *I = dyn_cast<Instruction>(Res))
+ setFlags(I);
+ return Res;
}
switch (getOpcode()) {
@@ -232,10 +288,10 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *A = State.get(getOperand(0), Part);
return Builder.CreateNot(A, Name);
}
- case VPInstruction::ICmpULE: {
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- return Builder.CreateICmpULE(IV, TC, Name);
+ case Instruction::ICmp: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::Select: {
Value *Cond = State.get(getOperand(0), Part);
@@ -285,23 +341,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
return Builder.CreateSelect(Cmp, Sub, Zero);
}
- case VPInstruction::CanonicalIVIncrement:
- case VPInstruction::CanonicalIVIncrementNUW: {
- if (Part == 0) {
- bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
- auto *Phi = State.get(getOperand(0), 0);
- // The loop step is equal to the vectorization factor (num of SIMD
- // elements) times the unroll factor (num of SIMD instructions).
- Value *Step =
- createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
- return Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
- }
- return State.get(this, 0);
- }
-
- case VPInstruction::CanonicalIVIncrementForPart:
- case VPInstruction::CanonicalIVIncrementForPartNUW: {
- bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
+ case VPInstruction::CanonicalIVIncrementForPart: {
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
if (Part == 0)
return IV;
@@ -309,7 +349,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
// The canonical IV is incremented by the vectorization factor (num of SIMD
// elements) times the unroll part.
Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
- return Builder.CreateAdd(IV, Step, Name, IsNUW, false);
+ return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
+ hasNoSignedWrap());
}
case VPInstruction::BranchOnCond: {
if (Part != 0)
@@ -361,10 +402,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
}
}
+#if !defined(NDEBUG)
+bool VPInstruction::isFPMathOp() const {
+ // Inspired by FPMathOperator::classof. Notable differences are that we don't
+ // support Call, PHI and Select opcodes here yet.
+ return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+ Opcode == Instruction::FCmp || Opcode == Instruction::Select;
+}
+#endif
+
void VPInstruction::execute(VPTransformState &State) {
assert(!State.Instance && "VPInstruction executing an Instance");
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(FMF);
+ assert((hasFastMathFlags() == isFPMathOp() ||
+ getOpcode() == Instruction::Select) &&
+ "Recipe not a FPMathOp but has fast-math flags?");
+ if (hasFastMathFlags())
+ State.Builder.setFastMathFlags(getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *GeneratedValue = generateInstruction(State, Part);
if (!hasResult())
@@ -393,9 +449,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Not:
O << "not";
break;
- case VPInstruction::ICmpULE:
- O << "icmp ule";
- break;
case VPInstruction::SLPLoad:
O << "combined load";
break;
@@ -408,12 +461,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
- case VPInstruction::CanonicalIVIncrement:
- O << "VF * UF + ";
- break;
- case VPInstruction::CanonicalIVIncrementNUW:
- O << "VF * UF +(nuw) ";
- break;
case VPInstruction::BranchOnCond:
O << "branch-on-cond";
break;
@@ -421,49 +468,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
O << "TC > VF ? TC - VF : 0";
break;
case VPInstruction::CanonicalIVIncrementForPart:
- O << "VF * Part + ";
- break;
- case VPInstruction::CanonicalIVIncrementForPartNUW:
- O << "VF * Part +(nuw) ";
+ O << "VF * Part +";
break;
case VPInstruction::BranchOnCount:
- O << "branch-on-count ";
+ O << "branch-on-count";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
- O << FMF;
-
- for (const VPValue *Operand : operands()) {
- O << " ";
- Operand->printAsOperand(O, SlotTracker);
- }
+ printFlags(O);
+ printOperands(O, SlotTracker);
- if (DL) {
+ if (auto DL = getDebugLoc()) {
O << ", !dbg ";
DL.print(O);
}
}
#endif
-void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
- // Make sure the VPInstruction is a floating-point operation.
- assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
- Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
- Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
- Opcode == Instruction::FCmp) &&
- "this op can't take fast-math flags");
- FMF = FMFNew;
-}
-
void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
auto &CI = *cast<CallInst>(getUnderlyingInstr());
assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&CI);
+ State.setDebugLocFrom(CI.getDebugLoc());
+ FunctionType *VFTy = nullptr;
+ if (Variant)
+ VFTy = Variant->getFunctionType();
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
@@ -475,12 +508,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
+ // Some vectorized function variants may also take a scalar argument,
+ // e.g. linear parameters for pointers.
Value *Arg;
- if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
- Arg = State.get(I.value(), Part);
- else
+ if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
+ (VectorIntrinsicID != Intrinsic::not_intrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
Arg = State.get(I.value(), VPIteration(0, 0));
+ else
+ Arg = State.get(I.value(), Part);
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
@@ -553,8 +589,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- auto &I = *cast<SelectInst>(getUnderlyingInstr());
- State.setDebugLocFromInst(&I);
+ State.setDebugLocFrom(getDebugLoc());
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
@@ -569,13 +604,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
Value *Op1 = State.get(getOperand(2), Part);
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
State.set(this, Sel, Part);
- State.addMetadata(Sel, &I);
+ State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
}
+VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
+ const FastMathFlags &FMF) {
+ AllowReassoc = FMF.allowReassoc();
+ NoNaNs = FMF.noNaNs();
+ NoInfs = FMF.noInfs();
+ NoSignedZeros = FMF.noSignedZeros();
+ AllowReciprocal = FMF.allowReciprocal();
+ AllowContract = FMF.allowContract();
+ ApproxFunc = FMF.approxFunc();
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
switch (OpType) {
+ case OperationType::Cmp:
+ O << " " << CmpInst::getPredicateName(getPredicate());
+ break;
+ case OperationType::DisjointOp:
+ if (DisjointFlags.IsDisjoint)
+ O << " disjoint";
+ break;
case OperationType::PossiblyExactOp:
if (ExactFlags.IsExact)
O << " exact";
@@ -593,17 +646,22 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
if (GEPFlags.IsInBounds)
O << " inbounds";
break;
+ case OperationType::NonNegOp:
+ if (NonNegFlags.NonNeg)
+ O << " nneg";
+ break;
case OperationType::Other:
break;
}
- O << " ";
+ if (getNumOperands() > 0)
+ O << " ";
}
#endif
void VPWidenRecipe::execute(VPTransformState &State) {
- auto &I = *cast<Instruction>(getUnderlyingValue());
+ State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
- switch (I.getOpcode()) {
+ switch (Opcode) {
case Instruction::Call:
case Instruction::Br:
case Instruction::PHI:
@@ -630,28 +688,24 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::Or:
case Instruction::Xor: {
// Just widen unops and binops.
- State.setDebugLocFromInst(&I);
-
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Value *, 2> Ops;
for (VPValue *VPOp : operands())
Ops.push_back(State.get(VPOp, Part));
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+ Value *V = Builder.CreateNAryOp(Opcode, Ops);
if (auto *VecOp = dyn_cast<Instruction>(V))
setFlags(VecOp);
// Use this vector value for all users of the original instruction.
State.set(this, V, Part);
- State.addMetadata(V, &I);
+ State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
break;
}
case Instruction::Freeze: {
- State.setDebugLocFromInst(&I);
-
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *Op = State.get(getOperand(0), Part);
@@ -663,9 +717,7 @@ void VPWidenRecipe::execute(VPTransformState &State) {
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- State.setDebugLocFromInst(Cmp);
+ bool FCmp = Opcode == Instruction::FCmp;
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -673,51 +725,64 @@ void VPWidenRecipe::execute(VPTransformState &State) {
if (FCmp) {
// Propagate fast math flags.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
+ Builder.setFastMathFlags(I->getFastMathFlags());
+ C = Builder.CreateFCmp(getPredicate(), A, B);
} else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ C = Builder.CreateICmp(getPredicate(), A, B);
}
State.set(this, C, Part);
- State.addMetadata(C, &I);
+ State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
break;
}
default:
// This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
+ << Instruction::getOpcodeName(Opcode));
llvm_unreachable("Unhandled instruction!");
} // end of switch.
+
+#if !defined(NDEBUG)
+ // Verify that VPlan type inference results agree with the type of the
+ // generated values.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
+ State.VF) == State.get(this, Part)->getType() &&
+ "inferred type and type from generated instructions do not match");
+ }
+#endif
}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
- const Instruction *UI = getUnderlyingInstr();
- O << " = " << UI->getOpcodeName();
+ O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
- if (auto *Cmp = dyn_cast<CmpInst>(UI))
- O << Cmp->getPredicate() << " ";
printOperands(O, SlotTracker);
}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
- auto *I = cast_or_null<Instruction>(getUnderlyingValue());
- if (I)
- State.setDebugLocFromInst(I);
+ State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
/// Vectorize casts.
assert(State.VF.isVector() && "Not vectorizing?");
Type *DestTy = VectorType::get(getResultType(), State.VF);
-
+ VPValue *Op = getOperand(0);
for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *A = State.get(getOperand(0), Part);
+ if (Part > 0 && Op->isLiveIn()) {
+ // FIXME: Remove once explicit unrolling is implemented using VPlan.
+ State.set(this, State.get(this, 0), Part);
+ continue;
+ }
+ Value *A = State.get(Op, Part);
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
State.set(this, Cast, Part);
- State.addMetadata(Cast, I);
+ State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
}
}
@@ -727,10 +792,182 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+ printFlags(O);
printOperands(O, SlotTracker);
O << " to " << *getResultType();
}
+#endif
+
+/// This function adds
+/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
+/// to each vector element of Val. The sequence starts at StartIndex.
+/// \p Opcode is relevant for FP induction variable.
+static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp, ElementCount VF,
+ IRBuilderBase &Builder) {
+ assert(VF.isVector() && "only vector VFs are supported");
+
+ // Create and check the types.
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ if (STy->isFloatingPointTy()) {
+ Type *InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
+
+ if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw
+ // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(InitVec, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
+ ElementCount VF) {
+ assert(FTy->isFloatingPointTy() && "Expected floating point type!");
+ Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
+ Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
+ return B.CreateUIToFP(RuntimeVF, FTy);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+
+ Value *Start = getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = getInductionDescriptor();
+ TruncInst *Trunc = getTruncInst();
+ IRBuilderBase &Builder = State.Builder;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(State.VF.isVector() && "must have vector VF");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
+
+ // Now do the actual transformations, and start with fetching the step value.
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
+ Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Type *StepType = Step->getType();
+ Value *RuntimeVF;
+ if (Step->getType()->isFloatingPointTy())
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+ Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(State.VF, Mul);
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
+ VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ State.set(this, LastInduction, Part);
+
+ if (isa<TruncInst>(EntryVal))
+ State.addMetadata(LastInduction, EntryVal);
+
+ LastInduction = cast<Instruction>(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ LastInduction->setName("vec.ind.next");
+ VecInd->addIncoming(SteppedStart, VectorPH);
+ // Add induction update using an incorrect block temporarily. The phi node
+ // will be fixed after VPlan execution. Note that at this point the latch
+ // block cannot be used, as it does not exist yet.
+ // TODO: Model increment value in VPlan, by turning the recipe into a
+ // multi-def and a subclass of VPHeaderPHIRecipe.
+ VecInd->addIncoming(LastInduction, VectorPH);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-INDUCTION";
@@ -770,17 +1007,112 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
- if (IndDesc.getStep()->getType() != ResultTy)
- O << " (truncated to " << *ResultTy << ")";
+ if (TruncResultTy)
+ O << " (truncated to " << *TruncResultTy << ")";
}
#endif
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (hasFastMathFlags())
+ State.Builder.setFastMathFlags(getFastMathFlags());
+
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step.
+
+ Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+ IRBuilderBase &Builder = State.Builder;
+
+ // Ensure step has the same type as that of scalar IV.
+ Type *BaseIVTy = BaseIV->getType()->getScalarType();
+ if (BaseIVTy != Step->getType()) {
+ // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
+ // avoid separate truncate here.
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ Step = State.Builder.CreateTrunc(Step, BaseIVTy);
+ }
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (BaseIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = InductionOpcode;
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration.
+ bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
+ // Compute the scalar steps and save the results in State.
+ Type *IntStepTy =
+ IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
+ Type *VecIVTy = nullptr;
+ Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+ if (!FirstLaneOnly && State.VF.isScalable()) {
+ VecIVTy = VectorType::get(BaseIVTy, State.VF);
+ UnitStepVec =
+ Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
+ SplatStep = Builder.CreateVectorSplat(State.VF, Step);
+ SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
+ }
+
+ unsigned StartPart = 0;
+ unsigned EndPart = State.UF;
+ unsigned StartLane = 0;
+ unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
+ if (State.Instance) {
+ StartPart = State.Instance->Part;
+ EndPart = StartPart + 1;
+ StartLane = State.Instance->Lane.getKnownLane();
+ EndLane = StartLane + 1;
+ }
+ for (unsigned Part = StartPart; Part < EndPart; ++Part) {
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
+
+ if (!FirstLaneOnly && State.VF.isScalable()) {
+ auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
+ auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
+ if (BaseIVTy->isFloatingPointTy())
+ InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+ auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+ auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+ State.set(this, Add, Part);
+ // It's useful to record the lane values too for the known minimum number
+ // of elements so we do those below. This improves the code quality when
+ // trying to extract the first element, for example.
+ }
+
+ if (BaseIVTy->isFloatingPointTy())
+ StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
+
+ for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
+ Value *StartIdx = Builder.CreateBinOp(
+ AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
+ // The step returned by `createStepForVF` is a runtime-evaluated value
+ // when VF is scalable. Otherwise, it should be folded into a Constant.
+ assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
+ "Expected StartIdx to be folded to a constant when VF is not "
+ "scalable");
+ auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
+ State.set(this, Add, VPIteration(Part, Lane));
+ }
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
- O << Indent << "= SCALAR-STEPS ";
+ O << " = SCALAR-STEPS ";
printOperands(O, SlotTracker);
}
#endif
@@ -874,7 +1206,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
- State.setDebugLocFromInst(Phi);
+ State.setDebugLocFrom(getDebugLoc());
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
@@ -916,7 +1248,7 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "BLEND ";
- Phi->printAsOperand(O, false);
+ printAsOperand(O, SlotTracker);
O << " =";
if (getNumIncomingValues() == 1) {
// Not a User of any mask: not really blending, this is a
@@ -942,14 +1274,14 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " +";
if (isa<FPMathOperator>(getUnderlyingInstr()))
O << getUnderlyingInstr()->getFastMathFlags();
- O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
+ O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
getVecOp()->printAsOperand(O, SlotTracker);
if (getCondOp()) {
O << ", ";
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
- if (RdxDesc->IntermediateStore)
+ if (RdxDesc.IntermediateStore)
O << " (with final reduction value stored in invariant address sank "
"outside of loop)";
}
@@ -1093,12 +1425,12 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
Value *Start = getStartValue()->getLiveInIRValue();
- PHINode *EntryPart = PHINode::Create(
- Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+ PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
+ EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
EntryPart->addIncoming(Start, VectorPH);
- EntryPart->setDebugLoc(DL);
+ EntryPart->setDebugLoc(getDebugLoc());
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
State.set(this, EntryPart, Part);
}
@@ -1108,7 +1440,8 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
- O << " = CANONICAL-INDUCTION";
+ O << " = CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
}
#endif
@@ -1221,8 +1554,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
}
// Create a phi node for the new recurrence.
- PHINode *EntryPart = PHINode::Create(
- VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
+ PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
+ EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
EntryPart->addIncoming(VectorInit, VectorPH);
State.set(this, EntryPart, 0);
}
@@ -1254,8 +1587,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
"recipe must be in the vector loop header");
unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *EntryPart =
- PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
+ Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
+ EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
State.set(this, EntryPart, Part);
}
@@ -1269,8 +1602,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Value *Iden = nullptr;
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
- // MinMax reduction have the start value as their identify.
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ // MinMax and AnyOf reductions have the start value as their identity.
if (ScalarPHI) {
Iden = StartV;
} else {
@@ -1316,23 +1649,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
assert(EnableVPlanNativePath &&
"Non-native vplans are not expected to have VPWidenPHIRecipes.");
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- VPBasicBlock *Parent = getParent();
- VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
- unsigned StartIdx = 0;
- // For phis in header blocks of loop regions, use the index of the value
- // coming from the preheader.
- if (LoopRegion->getEntryBasicBlock() == Parent) {
- for (unsigned I = 0; I < getNumOperands(); ++I) {
- if (getIncomingBlock(I) ==
- LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
- StartIdx = I;
- }
- }
- Value *Op0 = State.get(getOperand(StartIdx), 0);
+ Value *Op0 = State.get(getOperand(0), 0);
Type *VecTy = Op0->getType();
Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
State.set(this, VecPhi, 0);
@@ -1368,7 +1685,7 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
PHINode *EntryPart =
State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
EntryPart->addIncoming(StartMask, VectorPH);
- EntryPart->setDebugLoc(DL);
+ EntryPart->setDebugLoc(getDebugLoc());
State.set(this, EntryPart, Part);
}
}