diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 571 |
1 files changed, 444 insertions, 127 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 26c309eed800..c23428e2ba34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "VPlan.h" +#include "VPlanAnalysis.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -114,6 +115,16 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPDerivedIVSC: case VPPredInstPHISC: return false; + case VPInstructionSC: + switch (cast<VPInstruction>(this)->getOpcode()) { + case Instruction::ICmp: + case VPInstruction::Not: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + return false; + default: + return true; + } case VPWidenCallSC: return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) ->mayHaveSideEffects(); @@ -156,8 +167,13 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); if (vputils::isUniformAfterVectorization(ExitValue)) Lane = VPLane::getFirstLane(); + VPBasicBlock *MiddleVPBB = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); + assert(MiddleVPBB->getNumSuccessors() == 0 && + "the middle block must not have any successors"); + BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB]; Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), - State.Builder.GetInsertBlock()); + MiddleBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -216,15 +232,55 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } +FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { + assert(OpType == OperationType::FPMathOp && + "recipe doesn't have fast math flags"); + FastMathFlags Res; + Res.setAllowReassoc(FMFs.AllowReassoc); + Res.setNoNaNs(FMFs.NoNaNs); + Res.setNoInfs(FMFs.NoInfs); + Res.setNoSignedZeros(FMFs.NoSignedZeros); + Res.setAllowReciprocal(FMFs.AllowReciprocal); + Res.setAllowContract(FMFs.AllowContract); + Res.setApproxFunc(FMFs.ApproxFunc); + return Res; +} + +VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, + VPValue *A, VPValue *B, DebugLoc DL, + const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}), + Pred, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + assert(Opcode == Instruction::ICmp && + "only ICmp predicates supported at the moment"); +} + +VPInstruction::VPInstruction(unsigned Opcode, + std::initializer_list<VPValue *> Operands, + FastMathFlags FMFs, DebugLoc DL, const Twine &Name) + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), + VPValue(this), Opcode(Opcode), Name(Name.str()) { + // Make sure the VPInstruction is a floating-point operation. + assert(isFPMathOp() && "this op can't take fast-math flags"); +} + Value *VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilderBase &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); + Builder.SetCurrentDebugLocation(getDebugLoc()); if (Instruction::isBinaryOp(getOpcode())) { + if (Part != 0 && vputils::onlyFirstPartUsed(this)) + return State.get(this, 0); + Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + auto *Res = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); + if (auto *I = dyn_cast<Instruction>(Res)) + setFlags(I); + return Res; } switch (getOpcode()) { @@ -232,10 +288,10 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *A = State.get(getOperand(0), Part); return Builder.CreateNot(A, Name); } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - return Builder.CreateICmpULE(IV, TC, Name); + case Instruction::ICmp: { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); @@ -285,23 +341,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); return Builder.CreateSelect(Cmp, Sub, Zero); } - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - return Builder.CreateAdd(Phi, Step, Name, IsNUW, false); - } - return State.get(this, 0); - } - - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::CanonicalIVIncrementForPartNUW: { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + case VPInstruction::CanonicalIVIncrementForPart: { auto *IV = State.get(getOperand(0), VPIteration(0, 0)); if (Part == 0) return IV; @@ -309,7 +349,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, // The canonical IV is incremented by the vectorization factor (num of SIMD // elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, IsNUW, false); + return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), + hasNoSignedWrap()); } case VPInstruction::BranchOnCond: { if (Part != 0) @@ -361,10 +402,25 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, } } +#if !defined(NDEBUG) +bool VPInstruction::isFPMathOp() const { + // Inspired by FPMathOperator::classof. Notable differences are that we don't + // support Call, PHI and Select opcodes here yet. + return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp || Opcode == Instruction::Select; +} +#endif + void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); + assert((hasFastMathFlags() == isFPMathOp() || + getOpcode() == Instruction::Select) && + "Recipe not a FPMathOp but has fast-math flags?"); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *GeneratedValue = generateInstruction(State, Part); if (!hasResult()) @@ -393,9 +449,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Not: O << "not"; break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; case VPInstruction::SLPLoad: O << "combined load"; break; @@ -408,12 +461,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; @@ -421,49 +468,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << "TC > VF ? TC - VF : 0"; break; case VPInstruction::CanonicalIVIncrementForPart: - O << "VF * Part + "; - break; - case VPInstruction::CanonicalIVIncrementForPartNUW: - O << "VF * Part +(nuw) "; + O << "VF * Part +"; break; case VPInstruction::BranchOnCount: - O << "branch-on-count "; + O << "branch-on-count"; break; default: O << Instruction::getOpcodeName(getOpcode()); } - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); - } + printFlags(O); + printOperands(O, SlotTracker); - if (DL) { + if (auto DL = getDebugLoc()) { O << ", !dbg "; DL.print(O); } } #endif -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); auto &CI = *cast<CallInst>(getUnderlyingInstr()); assert(!isa<DbgInfoIntrinsic>(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - State.setDebugLocFromInst(&CI); + State.setDebugLocFrom(CI.getDebugLoc()); + FunctionType *VFTy = nullptr; + if (Variant) + VFTy = Variant->getFunctionType(); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Type *, 2> TysForDecl; // Add return type if intrinsic is overloaded on it. @@ -475,12 +508,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. Value *Arg; - if (VectorIntrinsicID == Intrinsic::not_intrinsic || - !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) - Arg = State.get(I.value(), Part); - else + if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || + (VectorIntrinsicID != Intrinsic::not_intrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) Arg = State.get(I.value(), VPIteration(0, 0)); + else + Arg = State.get(I.value(), Part); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); @@ -553,8 +589,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); + State.setDebugLocFrom(getDebugLoc()); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -569,13 +604,31 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); - State.addMetadata(Sel, &I); + State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } } +VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( + const FastMathFlags &FMF) { + AllowReassoc = FMF.allowReassoc(); + NoNaNs = FMF.noNaNs(); + NoInfs = FMF.noInfs(); + NoSignedZeros = FMF.noSignedZeros(); + AllowReciprocal = FMF.allowReciprocal(); + AllowContract = FMF.allowContract(); + ApproxFunc = FMF.approxFunc(); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { switch (OpType) { + case OperationType::Cmp: + O << " " << CmpInst::getPredicateName(getPredicate()); + break; + case OperationType::DisjointOp: + if (DisjointFlags.IsDisjoint) + O << " disjoint"; + break; case OperationType::PossiblyExactOp: if (ExactFlags.IsExact) O << " exact"; @@ -593,17 +646,22 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { if (GEPFlags.IsInBounds) O << " inbounds"; break; + case OperationType::NonNegOp: + if (NonNegFlags.NonNeg) + O << " nneg"; + break; case OperationType::Other: break; } - O << " "; + if (getNumOperands() > 0) + O << " "; } #endif void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; - switch (I.getOpcode()) { + switch (Opcode) { case Instruction::Call: case Instruction::Br: case Instruction::PHI: @@ -630,28 +688,24 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector<Value *, 2> Ops; for (VPValue *VPOp : operands()) Ops.push_back(State.get(VPOp, Part)); - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + Value *V = Builder.CreateNAryOp(Opcode, Ops); if (auto *VecOp = dyn_cast<Instruction>(V)) setFlags(VecOp); // Use this vector value for all users of the original instruction. State.set(this, V, Part); - State.addMetadata(V, &I); + State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - for (unsigned Part = 0; Part < State.UF; ++Part) { Value *Op = State.get(getOperand(0), Part); @@ -663,9 +717,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); + bool FCmp = Opcode == Instruction::FCmp; for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); @@ -673,51 +725,64 @@ void VPWidenRecipe::execute(VPTransformState &State) { if (FCmp) { // Propagate fast math flags. IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) + Builder.setFastMathFlags(I->getFastMathFlags()); + C = Builder.CreateFCmp(getPredicate(), A, B); } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + C = Builder.CreateICmp(getPredicate(), A, B); } State.set(this, C, Part); - State.addMetadata(C, &I); + State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue())); } break; } default: // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " + << Instruction::getOpcodeName(Opcode)); llvm_unreachable("Unhandled instruction!"); } // end of switch. + +#if !defined(NDEBUG) + // Verify that VPlan type inference results agree with the type of the + // generated values. + for (unsigned Part = 0; Part < State.UF; ++Part) { + assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), + State.VF) == State.get(this, Part)->getType() && + "inferred type and type from generated instructions do not match"); + } +#endif } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); - const Instruction *UI = getUnderlyingInstr(); - O << " = " << UI->getOpcodeName(); + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); - if (auto *Cmp = dyn_cast<CmpInst>(UI)) - O << Cmp->getPredicate() << " "; printOperands(O, SlotTracker); } #endif void VPWidenCastRecipe::execute(VPTransformState &State) { - auto *I = cast_or_null<Instruction>(getUnderlyingValue()); - if (I) - State.setDebugLocFromInst(I); + State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; /// Vectorize casts. assert(State.VF.isVector() && "Not vectorizing?"); Type *DestTy = VectorType::get(getResultType(), State.VF); - + VPValue *Op = getOperand(0); for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); + if (Part > 0 && Op->isLiveIn()) { + // FIXME: Remove once explicit unrolling is implemented using VPlan. + State.set(this, State.get(this, 0), Part); + continue; + } + Value *A = State.get(Op, Part); Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); State.set(this, Cast, Part); - State.addMetadata(Cast, I); + State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue())); } } @@ -727,10 +792,182 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode) << " "; + printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); } +#endif + +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilderBase &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. + auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + if (STy->isFloatingPointTy()) { + Type *InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + +/// A helper function that returns an integer or floating-point constant with +/// value C. +static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { + return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) + : ConstantFP::get(Ty, C); +} + +static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, + ElementCount VF) { + assert(FTy->isFloatingPointTy() && "Expected floating point type!"); + Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); + Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); + return B.CreateUIToFP(RuntimeVF, FTy); +} + +void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Int or FP induction being replicated."); + + Value *Start = getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = getInductionDescriptor(); + TruncInst *Trunc = getTruncInst(); + IRBuilderBase &Builder = State.Builder; + assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(State.VF.isVector() && "must have vector VF"); + + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + + // Now do the actual transformations, and start with fetching the step value. + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // Construct the initial value of the vector IV in the vector loop preheader + auto CurrIP = Builder.saveIP(); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + Builder.SetInsertPoint(VectorPH->getTerminator()); + if (isa<TruncInst>(EntryVal)) { + assert(Start->getType()->isIntegerTy() && + "Truncation requires an integer type"); + auto *TruncType = cast<IntegerType>(EntryVal->getType()); + Step = Builder.CreateTrunc(Step, TruncType); + Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); + } + + Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); + + // We create vector phi nodes for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (Step->getType()->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + } + + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't + // handle a constant vector splat. + Value *SplatVF = isa<Constant>(Mul) + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + Builder.restoreIP(CurrIP); + + // We may need to add the step a number of times, depending on the unroll + // factor. The last of those goes into the PHI. + PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); + VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); + Instruction *LastInduction = VecInd; + for (unsigned Part = 0; Part < State.UF; ++Part) { + State.set(this, LastInduction, Part); + + if (isa<TruncInst>(EntryVal)) + State.addMetadata(LastInduction, EntryVal); + + LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); + } + + LastInduction->setName("vec.ind.next"); + VecInd->addIncoming(SteppedStart, VectorPH); + // Add induction update using an incorrect block temporarily. The phi node + // will be fixed after VPlan execution. Note that at this point the latch + // block cannot be used, as it does not exist yet. + // TODO: Model increment value in VPlan, by turning the recipe into a + // multi-def and a subclass of VPHeaderPHIRecipe. + VecInd->addIncoming(LastInduction, VectorPH); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -770,17 +1007,112 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, O << " * "; getStepValue()->printAsOperand(O, SlotTracker); - if (IndDesc.getStep()->getType() != ResultTy) - O << " (truncated to " << *ResultTy << ")"; + if (TruncResultTy) + O << " (truncated to " << *TruncResultTy << ")"; } #endif +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (hasFastMathFlags()) + State.Builder.setFastMathFlags(getFastMathFlags()); + + /// Compute scalar induction steps. \p ScalarIV is the scalar induction + /// variable on which to base the steps, \p Step is the size of the step. + + Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + IRBuilderBase &Builder = State.Builder; + + // Ensure step has the same type as that of scalar IV. + Type *BaseIVTy = BaseIV->getType()->getScalarType(); + if (BaseIVTy != Step->getType()) { + // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to + // avoid separate truncate here. + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + Step = State.Builder.CreateTrunc(Step, BaseIVTy); + } + + // We build scalar steps for both integer and floating-point induction + // variables. Here, we determine the kind of arithmetic we will perform. + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + if (BaseIVTy->isIntegerTy()) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = InductionOpcode; + MulOp = Instruction::FMul; + } + + // Determine the number of scalars we need to generate for each unroll + // iteration. + bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); + // Compute the scalar steps and save the results in State. + Type *IntStepTy = + IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); + Type *VecIVTy = nullptr; + Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; + if (!FirstLaneOnly && State.VF.isScalable()) { + VecIVTy = VectorType::get(BaseIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); + } + + unsigned StartPart = 0; + unsigned EndPart = State.UF; + unsigned StartLane = 0; + unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); + if (State.Instance) { + StartPart = State.Instance->Part; + EndPart = StartPart + 1; + StartLane = State.Instance->Lane.getKnownLane(); + EndLane = StartLane + 1; + } + for (unsigned Part = StartPart; Part < EndPart; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); + + if (!FirstLaneOnly && State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); + auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); + if (BaseIVTy->isFloatingPointTy()) + InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); + auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); + auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); + State.set(this, Add, Part); + // It's useful to record the lane values too for the known minimum number + // of elements so we do those below. This improves the code quality when + // trying to extract the first element, for example. + } + + if (BaseIVTy->isFloatingPointTy()) + StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); + + for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { + Value *StartIdx = Builder.CreateBinOp( + AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); + State.set(this, Add, VPIteration(Part, Lane)); + } + } +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); - O << Indent << "= SCALAR-STEPS "; + O << " = SCALAR-STEPS "; printOperands(O, SlotTracker); } #endif @@ -874,7 +1206,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); + State.setDebugLocFrom(getDebugLoc()); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. @@ -916,7 +1248,7 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; - Phi->printAsOperand(O, false); + printAsOperand(O, SlotTracker); O << " ="; if (getNumIncomingValues() == 1) { // Not a User of any mask: not really blending, this is a @@ -942,14 +1274,14 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << " +"; if (isa<FPMathOperator>(getUnderlyingInstr())) O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; + O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; - if (RdxDesc->IntermediateStore) + if (RdxDesc.IntermediateStore) O << " (with final reduction value stored in invariant address sank " "outside of loop)"; } @@ -1093,12 +1425,12 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); EntryPart->addIncoming(Start, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(this, EntryPart, Part); } @@ -1108,7 +1440,8 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; + O << " = CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1221,8 +1554,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { } // Create a phi node for the new recurrence. - PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); + PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); + EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); EntryPart->addIncoming(VectorInit, VectorPH); State.set(this, EntryPart, 0); } @@ -1254,8 +1587,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { "recipe must be in the vector loop header"); unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); + Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); + EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); State.set(this, EntryPart, Part); } @@ -1269,8 +1602,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + // MinMax and AnyOf reductions have the start value as their identity. if (ScalarPHI) { Iden = StartV; } else { @@ -1316,23 +1649,7 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); - // Currently we enter here in the VPlan-native path for non-induction - // PHIs where all control flow is uniform. We simply widen these PHIs. - // Create a vector phi with no operands - the vector phi operands will be - // set at the end of vector code generation. - VPBasicBlock *Parent = getParent(); - VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); - unsigned StartIdx = 0; - // For phis in header blocks of loop regions, use the index of the value - // coming from the preheader. - if (LoopRegion->getEntryBasicBlock() == Parent) { - for (unsigned I = 0; I < getNumOperands(); ++I) { - if (getIncomingBlock(I) == - LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) - StartIdx = I; - } - } - Value *Op0 = State.get(getOperand(StartIdx), 0); + Value *Op0 = State.get(getOperand(0), 0); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); State.set(this, VecPhi, 0); @@ -1368,7 +1685,7 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { PHINode *EntryPart = State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); EntryPart->addIncoming(StartMask, VectorPH); - EntryPart->setDebugLoc(DL); + EntryPart->setDebugLoc(getDebugLoc()); State.set(this, EntryPart, Part); } } |
