diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 385 | 
1 files changed, 224 insertions, 161 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4e9be35001ad..26c309eed800 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -34,7 +34,9 @@ using namespace llvm;  using VectorParts = SmallVector<Value *, 2>; +namespace llvm {  extern cl::opt<bool> EnableVPlanNativePath; +}  #define LV_NAME "loop-vectorize"  #define DEBUG_TYPE LV_NAME @@ -50,14 +52,16 @@ bool VPRecipeBase::mayWriteToMemory() const {          ->mayWriteToMemory();    case VPBranchOnMaskSC:    case VPScalarIVStepsSC: +  case VPPredInstPHISC:      return false; -  case VPWidenIntOrFpInductionSC: +  case VPBlendSC: +  case VPReductionSC:    case VPWidenCanonicalIVSC: +  case VPWidenCastSC: +  case VPWidenGEPSC: +  case VPWidenIntOrFpInductionSC:    case VPWidenPHISC: -  case VPBlendSC:    case VPWidenSC: -  case VPWidenGEPSC: -  case VPReductionSC:    case VPWidenSelectSC: {      const Instruction *I =          dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -82,14 +86,16 @@ bool VPRecipeBase::mayReadFromMemory() const {          ->mayReadFromMemory();    case VPBranchOnMaskSC:    case VPScalarIVStepsSC: +  case VPPredInstPHISC:      return false; -  case VPWidenIntOrFpInductionSC: +  case VPBlendSC: +  case VPReductionSC:    case VPWidenCanonicalIVSC: +  case VPWidenCastSC: +  case VPWidenGEPSC: +  case VPWidenIntOrFpInductionSC:    case VPWidenPHISC: -  case VPBlendSC:    case VPWidenSC: -  case VPWidenGEPSC: -  case VPReductionSC:    case VPWidenSelectSC: {      const Instruction *I =          dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); @@ -108,16 +114,20 @@ bool VPRecipeBase::mayHaveSideEffects() const {    case VPDerivedIVSC:    case VPPredInstPHISC:      return false; -  case VPWidenIntOrFpInductionSC: -  case VPWidenPointerInductionSC: +  case VPWidenCallSC: +    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) +        ->mayHaveSideEffects(); +  case VPBlendSC: +  case VPReductionSC: +  case VPScalarIVStepsSC:    case VPWidenCanonicalIVSC: +  case VPWidenCastSC: +  case VPWidenGEPSC: +  case VPWidenIntOrFpInductionSC:    case VPWidenPHISC: -  case VPBlendSC: +  case VPWidenPointerInductionSC:    case VPWidenSC: -  case VPWidenGEPSC: -  case VPReductionSC: -  case VPWidenSelectSC: -  case VPScalarIVStepsSC: { +  case VPWidenSelectSC: {      const Instruction *I =          dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());      (void)I; @@ -125,6 +135,13 @@ bool VPRecipeBase::mayHaveSideEffects() const {             "underlying instruction has side-effects");      return false;    } +  case VPWidenMemoryInstructionSC: +    assert(cast<VPWidenMemoryInstructionRecipe>(this) +                   ->getIngredient() +                   .mayHaveSideEffects() == mayWriteToMemory() && +           "mayHaveSideffects result for ingredient differs from this " +           "implementation"); +    return mayWriteToMemory();    case VPReplicateSC: {      auto *R = cast<VPReplicateRecipe>(this);      return R->getUnderlyingInstr()->mayHaveSideEffects(); @@ -143,6 +160,16 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {                     State.Builder.GetInsertBlock());  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { +  O << "Live-out "; +  getPhi()->printAsOperand(O); +  O << " = "; +  getOperand(0)->printAsOperand(O, SlotTracker); +  O << "\n"; +} +#endif +  void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {    assert(!Parent && "Recipe already in some VPBasicBlock");    assert(InsertPos->getParent() && @@ -189,55 +216,44 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,    insertBefore(BB, I);  } -void VPInstruction::generateInstruction(VPTransformState &State, -                                        unsigned Part) { +Value *VPInstruction::generateInstruction(VPTransformState &State, +                                          unsigned Part) {    IRBuilderBase &Builder = State.Builder;    Builder.SetCurrentDebugLocation(DL);    if (Instruction::isBinaryOp(getOpcode())) {      Value *A = State.get(getOperand(0), Part);      Value *B = State.get(getOperand(1), Part); -    Value *V = -        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); -    State.set(this, V, Part); -    return; +    return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);    }    switch (getOpcode()) {    case VPInstruction::Not: {      Value *A = State.get(getOperand(0), Part); -    Value *V = Builder.CreateNot(A, Name); -    State.set(this, V, Part); -    break; +    return Builder.CreateNot(A, Name);    }    case VPInstruction::ICmpULE: {      Value *IV = State.get(getOperand(0), Part);      Value *TC = State.get(getOperand(1), Part); -    Value *V = Builder.CreateICmpULE(IV, TC, Name); -    State.set(this, V, Part); -    break; +    return Builder.CreateICmpULE(IV, TC, Name);    }    case Instruction::Select: {      Value *Cond = State.get(getOperand(0), Part);      Value *Op1 = State.get(getOperand(1), Part);      Value *Op2 = State.get(getOperand(2), Part); -    Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); -    State.set(this, V, Part); -    break; +    return Builder.CreateSelect(Cond, Op1, Op2, Name);    }    case VPInstruction::ActiveLaneMask: {      // Get first lane of vector induction variable.      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));      // Get the original loop tripcount. -    Value *ScalarTC = State.get(getOperand(1), Part); +    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));      auto *Int1Ty = Type::getInt1Ty(Builder.getContext());      auto *PredTy = VectorType::get(Int1Ty, State.VF); -    Instruction *Call = Builder.CreateIntrinsic( -        Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, -        {VIVElem0, ScalarTC}, nullptr, Name); -    State.set(this, Call, Part); -    break; +    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, +                                   {PredTy, ScalarTC->getType()}, +                                   {VIVElem0, ScalarTC}, nullptr, Name);    }    case VPInstruction::FirstOrderRecurrenceSplice: {      // Generate code to combine the previous and current values in vector v3. @@ -255,18 +271,22 @@ void VPInstruction::generateInstruction(VPTransformState &State,      // For the first part, use the recurrence phi (v1), otherwise v2.      auto *V1 = State.get(getOperand(0), 0);      Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); -    if (!PartMinus1->getType()->isVectorTy()) { -      State.set(this, PartMinus1, Part); -    } else { -      Value *V2 = State.get(getOperand(1), Part); -      State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), -                Part); -    } -    break; +    if (!PartMinus1->getType()->isVectorTy()) +      return PartMinus1; +    Value *V2 = State.get(getOperand(1), Part); +    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name); +  } +  case VPInstruction::CalculateTripCountMinusVF: { +    Value *ScalarTC = State.get(getOperand(0), {0, 0}); +    Value *Step = +        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); +    Value *Sub = Builder.CreateSub(ScalarTC, Step); +    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); +    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); +    return Builder.CreateSelect(Cmp, Sub, Zero);    }    case VPInstruction::CanonicalIVIncrement:    case VPInstruction::CanonicalIVIncrementNUW: { -    Value *Next = nullptr;      if (Part == 0) {        bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;        auto *Phi = State.get(getOperand(0), 0); @@ -274,34 +294,26 @@ void VPInstruction::generateInstruction(VPTransformState &State,        // elements) times the unroll factor (num of SIMD instructions).        Value *Step =            createStepForVF(Builder, Phi->getType(), State.VF, State.UF); -      Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); -    } else { -      Next = State.get(this, 0); +      return Builder.CreateAdd(Phi, Step, Name, IsNUW, false);      } - -    State.set(this, Next, Part); -    break; +    return State.get(this, 0);    }    case VPInstruction::CanonicalIVIncrementForPart:    case VPInstruction::CanonicalIVIncrementForPartNUW: {      bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;      auto *IV = State.get(getOperand(0), VPIteration(0, 0)); -    if (Part == 0) { -      State.set(this, IV, Part); -      break; -    } +    if (Part == 0) +      return IV;      // The canonical IV is incremented by the vectorization factor (num of SIMD      // elements) times the unroll part.      Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); -    Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); -    State.set(this, Next, Part); -    break; +    return Builder.CreateAdd(IV, Step, Name, IsNUW, false);    }    case VPInstruction::BranchOnCond: {      if (Part != 0) -      break; +      return nullptr;      Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));      VPRegionBlock *ParentRegion = getParent()->getParent(); @@ -318,11 +330,11 @@ void VPInstruction::generateInstruction(VPTransformState &State,      CondBr->setSuccessor(0, nullptr);      Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); -    break; +    return CondBr;    }    case VPInstruction::BranchOnCount: {      if (Part != 0) -      break; +      return nullptr;      // First create the compare.      Value *IV = State.get(getOperand(0), Part);      Value *TC = State.get(getOperand(1), Part); @@ -342,7 +354,7 @@ void VPInstruction::generateInstruction(VPTransformState &State,                                                State.CFG.VPBB2IRBB[Header]);      CondBr->setSuccessor(0, nullptr);      Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); -    break; +    return CondBr;    }    default:      llvm_unreachable("Unsupported opcode for instruction"); @@ -353,8 +365,13 @@ void VPInstruction::execute(VPTransformState &State) {    assert(!State.Instance && "VPInstruction executing an Instance");    IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);    State.Builder.setFastMathFlags(FMF); -  for (unsigned Part = 0; Part < State.UF; ++Part) -    generateInstruction(State, Part); +  for (unsigned Part = 0; Part < State.UF; ++Part) { +    Value *GeneratedValue = generateInstruction(State, Part); +    if (!hasResult()) +      continue; +    assert(GeneratedValue && "generateInstruction must produce a value"); +    State.set(this, GeneratedValue, Part); +  }  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -400,6 +417,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,    case VPInstruction::BranchOnCond:      O << "branch-on-cond";      break; +  case VPInstruction::CalculateTripCountMinusVF: +    O << "TC > VF ? TC - VF : 0"; +    break;    case VPInstruction::CanonicalIVIncrementForPart:      O << "VF * Part + ";      break; @@ -438,18 +458,19 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {  }  void VPWidenCallRecipe::execute(VPTransformState &State) { +  assert(State.VF.isVector() && "not widening");    auto &CI = *cast<CallInst>(getUnderlyingInstr());    assert(!isa<DbgInfoIntrinsic>(CI) &&           "DbgInfoIntrinsic should have been dropped during VPlan construction");    State.setDebugLocFromInst(&CI); -  SmallVector<Type *, 4> Tys; -  for (Value *ArgOperand : CI.args()) -    Tys.push_back( -        ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue())); -    for (unsigned Part = 0; Part < State.UF; ++Part) { -    SmallVector<Type *, 2> TysForDecl = {CI.getType()}; +    SmallVector<Type *, 2> TysForDecl; +    // Add return type if intrinsic is overloaded on it. +    if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) { +      TysForDecl.push_back( +          VectorType::get(CI.getType()->getScalarType(), State.VF)); +    }      SmallVector<Value *, 4> Args;      for (const auto &I : enumerate(operands())) {        // Some intrinsics have a scalar argument - don't replace it with a @@ -468,21 +489,16 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {      Function *VectorF;      if (VectorIntrinsicID != Intrinsic::not_intrinsic) {        // Use vector version of the intrinsic. -      if (State.VF.isVector()) -        TysForDecl[0] = -            VectorType::get(CI.getType()->getScalarType(), State.VF);        Module *M = State.Builder.GetInsertBlock()->getModule();        VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);        assert(VectorF && "Can't retrieve vector intrinsic.");      } else { -      // Use vector version of the function call. -      const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/);  #ifndef NDEBUG -      assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr && -             "Can't create vector function."); +      assert(Variant != nullptr && "Can't create vector function.");  #endif -      VectorF = VFDatabase(CI).getVectorizedFunction(Shape); +      VectorF = Variant;      } +      SmallVector<OperandBundleDef, 1> OpBundles;      CI.getOperandBundlesAsDefs(OpBundles);      CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); @@ -514,8 +530,12 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,    if (VectorIntrinsicID)      O << " (using vector intrinsic)"; -  else -    O << " (using library function)"; +  else { +    O << " (using library function"; +    if (Variant->hasName()) +      O << ": " << Variant->getName(); +    O << ")"; +  }  }  void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, @@ -528,7 +548,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,    getOperand(1)->printAsOperand(O, SlotTracker);    O << ", ";    getOperand(2)->printAsOperand(O, SlotTracker); -  O << (InvariantCond ? " (condition is loop invariant)" : ""); +  O << (isInvariantCond() ? " (condition is loop invariant)" : "");  }  #endif @@ -541,10 +561,10 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {    // We have to take the 'vectorized' value and pick the first lane.    // Instcombine will make this a no-op.    auto *InvarCond = -      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; +      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;    for (unsigned Part = 0; Part < State.UF; ++Part) { -    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); +    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);      Value *Op0 = State.get(getOperand(1), Part);      Value *Op1 = State.get(getOperand(2), Part);      Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); @@ -553,6 +573,33 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {    }  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { +  switch (OpType) { +  case OperationType::PossiblyExactOp: +    if (ExactFlags.IsExact) +      O << " exact"; +    break; +  case OperationType::OverflowingBinOp: +    if (WrapFlags.HasNUW) +      O << " nuw"; +    if (WrapFlags.HasNSW) +      O << " nsw"; +    break; +  case OperationType::FPMathOp: +    getFastMathFlags().print(O); +    break; +  case OperationType::GEPOp: +    if (GEPFlags.IsInBounds) +      O << " inbounds"; +    break; +  case OperationType::Other: +    break; +  } +  O << " "; +} +#endif +  void VPWidenRecipe::execute(VPTransformState &State) {    auto &I = *cast<Instruction>(getUnderlyingValue());    auto &Builder = State.Builder; @@ -592,17 +639,8 @@ void VPWidenRecipe::execute(VPTransformState &State) {        Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); -      if (auto *VecOp = dyn_cast<Instruction>(V)) { -        VecOp->copyIRFlags(&I); - -        // If the instruction is vectorized and was in a basic block that needed -        // predication, we can't propagate poison-generating flags (nuw/nsw, -        // exact, etc.). The control flow has been linearized and the -        // instruction is no longer guarded by the predicate, which could make -        // the flag properties to no longer hold. -        if (State.MayGeneratePoisonRecipes.contains(this)) -          VecOp->dropPoisonGeneratingFlags(); -      } +      if (auto *VecOp = dyn_cast<Instruction>(V)) +        setFlags(VecOp);        // Use this vector value for all users of the original instruction.        State.set(this, V, Part); @@ -646,35 +684,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {      break;    } - -  case Instruction::ZExt: -  case Instruction::SExt: -  case Instruction::FPToUI: -  case Instruction::FPToSI: -  case Instruction::FPExt: -  case Instruction::PtrToInt: -  case Instruction::IntToPtr: -  case Instruction::SIToFP: -  case Instruction::UIToFP: -  case Instruction::Trunc: -  case Instruction::FPTrunc: -  case Instruction::BitCast: { -    auto *CI = cast<CastInst>(&I); -    State.setDebugLocFromInst(CI); - -    /// Vectorize casts. -    Type *DestTy = (State.VF.isScalar()) -                       ? CI->getType() -                       : VectorType::get(CI->getType(), State.VF); - -    for (unsigned Part = 0; Part < State.UF; ++Part) { -      Value *A = State.get(getOperand(0), Part); -      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); -      State.set(this, Cast, Part); -      State.addMetadata(Cast, &I); -    } -    break; -  }    default:      // This instruction is not vectorized by simple widening.      LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); @@ -687,10 +696,39 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,    O << Indent << "WIDEN ";    printAsOperand(O, SlotTracker);    const Instruction *UI = getUnderlyingInstr(); -  O << " = " << UI->getOpcodeName() << " "; +  O << " = " << UI->getOpcodeName(); +  printFlags(O);    if (auto *Cmp = dyn_cast<CmpInst>(UI)) -    O << CmpInst::getPredicateName(Cmp->getPredicate()) << " "; +    O << Cmp->getPredicate() << " "; +  printOperands(O, SlotTracker); +} +#endif + +void VPWidenCastRecipe::execute(VPTransformState &State) { +  auto *I = cast_or_null<Instruction>(getUnderlyingValue()); +  if (I) +    State.setDebugLocFromInst(I); +  auto &Builder = State.Builder; +  /// Vectorize casts. +  assert(State.VF.isVector() && "Not vectorizing?"); +  Type *DestTy = VectorType::get(getResultType(), State.VF); + +  for (unsigned Part = 0; Part < State.UF; ++Part) { +    Value *A = State.get(getOperand(0), Part); +    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); +    State.set(this, Cast, Part); +    State.addMetadata(Cast, I); +  } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, +                              VPSlotTracker &SlotTracker) const { +  O << Indent << "WIDEN-CAST "; +  printAsOperand(O, SlotTracker); +  O << " = " << Instruction::getOpcodeName(Opcode) << " ";    printOperands(O, SlotTracker); +  O << " to " << *getResultType();  }  void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, @@ -710,8 +748,13 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,  #endif  bool VPWidenIntOrFpInductionRecipe::isCanonical() const { +  // The step may be defined by a recipe in the preheader (e.g. if it requires +  // SCEV expansion), but for the canonical induction the step is required to be +  // 1, which is represented as live-in. +  if (getStepValue()->getDefiningRecipe()) +    return false; +  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());    auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); -  auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());    return StartC && StartC->isZero() && StepC && StepC->isOne();  } @@ -743,6 +786,7 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,  #endif  void VPWidenGEPRecipe::execute(VPTransformState &State) { +  assert(State.VF.isVector() && "not widening");    auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());    // Construct a vector GEP by widening the operands of the scalar GEP as    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -750,7 +794,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {    // is vector-typed. Thus, to keep the representation compact, we only use    // vector-typed operands for loop-varying values. -  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { +  if (areAllOperandsInvariant()) {      // If we are vectorizing, but the GEP has only loop-invariant operands,      // the GEP we build (by only using vector-typed operands for      // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -763,9 +807,15 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {      //       required. We would add the scalarization decision to      //       collectLoopScalars() and teach getVectorValue() to broadcast      //       the lane-zero scalar value. -    auto *Clone = State.Builder.Insert(GEP->clone()); +    SmallVector<Value *> Ops; +    for (unsigned I = 0, E = getNumOperands(); I != E; I++) +      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0))); + +    auto *NewGEP = +        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], +                                ArrayRef(Ops).drop_front(), "", isInBounds());      for (unsigned Part = 0; Part < State.UF; ++Part) { -      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); +      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);        State.set(this, EntryPart, Part);        State.addMetadata(EntryPart, GEP);      } @@ -780,7 +830,7 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {      for (unsigned Part = 0; Part < State.UF; ++Part) {        // The pointer operand of the new GEP. If it's loop-invariant, we        // won't broadcast it. -      auto *Ptr = IsPtrLoopInvariant +      auto *Ptr = isPointerLoopInvariant()                        ? State.get(getOperand(0), VPIteration(0, 0))                        : State.get(getOperand(0), Part); @@ -789,24 +839,16 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {        SmallVector<Value *, 4> Indices;        for (unsigned I = 1, E = getNumOperands(); I < E; I++) {          VPValue *Operand = getOperand(I); -        if (IsIndexLoopInvariant[I - 1]) +        if (isIndexLoopInvariant(I - 1))            Indices.push_back(State.get(Operand, VPIteration(0, 0)));          else            Indices.push_back(State.get(Operand, Part));        } -      // If the GEP instruction is vectorized and was in a basic block that -      // needed predication, we can't propagate the poison-generating 'inbounds' -      // flag. The control flow has been linearized and the GEP is no longer -      // guarded by the predicate, which could make the 'inbounds' properties to -      // no longer hold. -      bool IsInBounds = -          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; -        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,        // but it should be a vector, otherwise.        auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, -                                             Indices, "", IsInBounds); +                                             Indices, "", isInBounds());        assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&               "NewGEP is not a pointer vector");        State.set(this, NewGEP, Part); @@ -819,14 +861,14 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {  void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,                               VPSlotTracker &SlotTracker) const {    O << Indent << "WIDEN-GEP "; -  O << (IsPtrLoopInvariant ? "Inv" : "Var"); -  size_t IndicesNumber = IsIndexLoopInvariant.size(); -  for (size_t I = 0; I < IndicesNumber; ++I) -    O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; +  O << (isPointerLoopInvariant() ? "Inv" : "Var"); +  for (size_t I = 0; I < getNumOperands() - 1; ++I) +    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";    O << " ";    printAsOperand(O, SlotTracker); -  O << " = getelementptr "; +  O << " = getelementptr"; +  printFlags(O);    printOperands(O, SlotTracker);  }  #endif @@ -911,7 +953,21 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,      O << " (with final reduction value stored in invariant address sank "           "outside of loop)";  } +#endif + +bool VPReplicateRecipe::shouldPack() const { +  // Find if the recipe is used by a widened recipe via an intervening +  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. +  return any_of(users(), [](const VPUser *U) { +    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) +      return any_of(PredR->users(), [PredR](const VPUser *U) { +        return !U->usesScalars(PredR); +      }); +    return false; +  }); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,                                VPSlotTracker &SlotTracker) const {    O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); @@ -921,18 +977,21 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,      O << " = ";    }    if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { -    O << "call @" << CB->getCalledFunction()->getName() << "("; +    O << "call"; +    printFlags(O); +    O << "@" << CB->getCalledFunction()->getName() << "(";      interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),                      O, [&O, &SlotTracker](VPValue *Op) {                        Op->printAsOperand(O, SlotTracker);                      });      O << ")";    } else { -    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; +    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); +    printFlags(O);      printOperands(O, SlotTracker);    } -  if (AlsoPack) +  if (shouldPack())      O << " (S->V)";  }  #endif @@ -1053,20 +1112,22 @@ void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,  }  #endif -bool VPCanonicalIVPHIRecipe::isCanonical(const InductionDescriptor &ID, -                                         Type *Ty) const { -  if (Ty != getScalarType()) +bool VPCanonicalIVPHIRecipe::isCanonical( +    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step, +    Type *Ty) const { +  // The types must match and it must be an integer induction. +  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)      return false; -  // The start value of ID must match the start value of this canonical -  // induction. -  if (getStartValue()->getLiveInIRValue() != ID.getStartValue()) +  // Start must match the start value of this canonical induction. +  if (Start != getStartValue())      return false; -  ConstantInt *Step = ID.getConstIntStepValue(); -  // ID must also be incremented by one. IK_IntInduction always increment the -  // induction by Step, but the binary op may not be set. -  return ID.getKind() == InductionDescriptor::IK_IntInduction && Step && -         Step->isOne(); +  // If the step is defined by a recipe, it is not a ConstantInt. +  if (Step->getDefiningRecipe()) +    return false; + +  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); +  return StepC && StepC->isOne();  }  bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { @@ -1092,9 +1153,11 @@ void VPExpandSCEVRecipe::execute(VPTransformState &State) {    Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),                                   &*State.Builder.GetInsertPoint()); - +  assert(!State.ExpandedSCEVs.contains(Expr) && +         "Same SCEV expanded multiple times"); +  State.ExpandedSCEVs[Expr] = Res;    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) -    State.set(this, Res, Part); +    State.set(this, Res, {Part, 0});  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  | 
