| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-02-05 20:07:43 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-05-14 11:44:47 +0000 |
| commit | 1fd87a682ad7442327078e1eeb63edc4258f9815 (patch) | |
| tree | 83b42223e987ef7df2e1036937bc1bb627fa2779 | /contrib/llvm-project/llvm/lib/Transforms/Vectorize |
| parent | 04eeddc0aa8e0a417a16eaf9d7d095207f4a8623 (diff) | |
| parent | ecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
7 files changed, 354 insertions, 121 deletions
```diff
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d11f4146b590..3290439ecd07 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -632,13 +632,6 @@ protected:
                             Instruction *EntryVal, VPValue *Def,
                             VPTransformState &State);
 
-  /// Returns true if an instruction \p I should be scalarized instead of
-  /// vectorized for the chosen vectorization factor.
-  bool shouldScalarizeInstruction(Instruction *I) const;
-
-  /// Returns true if we should generate a scalar version of \p IV.
-  bool needsScalarInduction(Instruction *IV) const;
-
   /// Returns (and creates if needed) the original loop trip count.
   Value *getOrCreateTripCount(Loop *NewLoop);
 
@@ -2479,21 +2472,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }
 
-bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
-  return Cost->isScalarAfterVectorization(I, VF) ||
-         Cost->isProfitableToScalarize(I, VF);
-}
-
-bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
-  if (shouldScalarizeInstruction(IV))
-    return true;
-  auto isScalarInst = [&](User *U) -> bool {
-    auto *I = cast<Instruction>(U);
-    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
-  };
-  return llvm::any_of(IV->users(), isScalarInst);
-}
-
 void InnerLoopVectorizer::widenIntOrFpInduction(
     PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
     Value *CanonicalIV) {
@@ -2549,27 +2527,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
     return ScalarIV;
   };
 
-  // Create the vector values from the scalar IV, in the absence of creating a
-  // vector IV.
-  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
-    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *StartIdx;
-      if (Step->getType()->isFloatingPointTy())
-        StartIdx =
-            getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
-      else
-        StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
-
-      Value *EntryPart =
-          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
-                        State.VF, State.Builder);
-      State.set(Def, EntryPart, Part);
-      if (Trunc)
-        addMetadata(EntryPart, Trunc);
-    }
-  };
-
   // Fast-math-flags propagate from the original induction instruction.
   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
@@ -2605,36 +2562,18 @@ void InnerLoopVectorizer::widenIntOrFpInduction(
     return;
   }
 
-  // Determine if we want a scalar version of the induction variable. This is
-  // true if the induction variable itself is not widened, or if it has at
-  // least one user in the loop that is not widened.
-  auto NeedsScalarIV = needsScalarInduction(EntryVal);
-  if (!NeedsScalarIV) {
+  // Create a new independent vector induction variable, if one is needed.
+  if (Def->needsVectorIV())
     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-    return;
-  }
 
-  // Try to create a new independent vector induction variable. If we can't
-  // create the phi node, we will splat the scalar induction variable in each
-  // loop iteration.
-  if (!shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
-    Value *ScalarIV = CreateScalarIV(Step);
+  if (Def->needsScalarIV()) {
     // Create scalar steps that can be used by instructions we will later
     // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
     // InstCombine. We will be trading one vector extract for each scalar step.
+    Value *ScalarIV = CreateScalarIV(Step);
     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
-    return;
   }
-
-  // All IV users are scalar instructions, so only emit a scalar IV, not a
-  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
-  // predicate used by the masked loads/stores.
-  Value *ScalarIV = CreateScalarIV(Step);
-  if (!Cost->isScalarEpilogueAllowed())
-    CreateSplatIV(ScalarIV, Step);
-  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
 }
 
 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@@ -2663,17 +2602,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   }
 
   // Determine the number of scalars we need to generate for each unroll
-  // iteration. If EntryVal is uniform, we only need to generate the first
-  // lane. Otherwise, we generate all VF values.
-  bool IsUniform =
-      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
-  unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+  // iteration.
+  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
+  unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
 
   // Compute the scalar steps and save the results in State.
   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                      ScalarIVTy->getScalarSizeInBits());
   Type *VecIVTy = nullptr;
   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
-  if (!IsUniform && State.VF.isScalable()) {
+  if (!FirstLaneOnly && State.VF.isScalable()) {
     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
     UnitStepVec =
         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
@@ -2684,7 +2621,7 @@
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
 
-    if (!IsUniform && State.VF.isScalable()) {
+    if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (ScalarIVTy->isFloatingPointTy())
@@ -4565,7 +4502,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
       // Determine the number of scalars we need to generate for each unroll
       // iteration. If the instruction is uniform, we only need to generate the
       // first lane. Otherwise, we generate all VF values.
-      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
+      bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
       assert((IsUniform || !State.VF.isScalable()) &&
              "Cannot scalarize a scalable VF");
       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
@@ -5889,7 +5826,9 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   // consider interleaving beneficial (eg. MVE).
   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
     return false;
-  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
+  // FIXME: We should consider changing the threshold for scalable
+  // vectors to take VScaleForTuning into account.
+  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
     return true;
   return false;
 }
@@ -5940,29 +5879,21 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
     return Result;
   }
 
-  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
-  if (MainLoopVF.isScalable())
-    LLVM_DEBUG(
-        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
-                  "yet supported. Converting to fixed-width (VF="
-               << FixedMainLoopVF << ") instead\n");
-
-  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                          "this loop\n");
     return Result;
   }
 
   for (auto &NextVF : ProfitableVFs)
-    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
-        (Result.Width.getFixedValue() == 1 ||
-         isMoreProfitable(NextVF, Result)) &&
+    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+        (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
         LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;
 
   if (Result != VectorizationFactor::Disabled())
     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
-                      << Result.Width.getFixedValue() << "\n";);
+                      << Result.Width << "\n";);
   return Result;
 }
 
@@ -8546,16 +8477,54 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                             Mask, Consecutive, Reverse);
 }
 
-VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
-                                           ArrayRef<VPValue *> Operands) const {
+static VPWidenIntOrFpInductionRecipe *
+createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
+                           VPValue *Start, const InductionDescriptor &IndDesc,
+                           LoopVectorizationCostModel &CM, Loop &OrigLoop,
+                           VFRange &Range) {
+  // Returns true if an instruction \p I should be scalarized instead of
+  // vectorized for the chosen vectorization factor.
+  auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
+    return CM.isScalarAfterVectorization(I, VF) ||
+           CM.isProfitableToScalarize(I, VF);
+  };
+
+  bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) {
+        // Returns true if we should generate a scalar version of \p IV.
+        if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
+          return true;
+        auto isScalarInst = [&](User *U) -> bool {
+          auto *I = cast<Instruction>(U);
+          return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
+        };
+        return any_of(PhiOrTrunc->users(), isScalarInst);
+      },
+      Range);
+  bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) {
+        return ShouldScalarizeInstruction(PhiOrTrunc, VF);
+      },
+      Range);
+  assert(IndDesc.getStartValue() ==
+         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
+    return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
+                                             NeedsScalarIV, !NeedsScalarIVOnly);
+  }
+  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
+  return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
+                                           !NeedsScalarIVOnly);
+}
+
+VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
+    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
   // Check if this is an integer or fp induction. If so, build the recipe that
   // produces its scalar and vector values.
-  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
-    assert(II->getStartValue() ==
-           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
-    return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
-  }
+  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
+    return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
+                                      Range);
   return nullptr;
 }
 
@@ -8583,7 +8552,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
     auto *Phi = cast<PHINode>(I->getOperand(0));
     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
-    return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
+    return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
   }
   return nullptr;
 }
 
@@ -8865,7 +8834,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (auto Phi = dyn_cast<PHINode>(Instr)) {
     if (Phi->getParent() != OrigLoop->getHeader())
       return tryToBlend(Phi, Operands, Plan);
-    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
+    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
       return toVPRecipeResult(Recipe);
 
     VPHeaderPHIRecipe *PhiRecipe = nullptr;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99c265fc5101..15b349f53fd9 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -471,17 +471,36 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       unsigned BaseIndex = 0);
+
+/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
+/// compatible instructions or constants, or just some other regular values.
+static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
+                                Value *Op1) {
+  return (isConstant(BaseOp0) && isConstant(Op0)) ||
+         (isConstant(BaseOp1) && isConstant(Op1)) ||
+         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
+          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
+         getSameOpcode({BaseOp0, Op0}).getOpcode() ||
+         getSameOpcode({BaseOp1, Op1}).getOpcode();
+}
+
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
-                                       unsigned BaseIndex = 0) {
+                                       unsigned BaseIndex) {
   // Make sure these are all Instructions.
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
+  CmpInst::Predicate BasePred =
+      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
+              : CmpInst::BAD_ICMP_PREDICATE;
   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
   unsigned AltOpcode = Opcode;
   unsigned AltIndex = BaseIndex;
@@ -514,6 +533,57 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
+    } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
+      auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
+      auto *Inst = cast<Instruction>(VL[Cnt]);
+      Type *Ty0 = BaseInst->getOperand(0)->getType();
+      Type *Ty1 = Inst->getOperand(0)->getType();
+      if (Ty0 == Ty1) {
+        Value *BaseOp0 = BaseInst->getOperand(0);
+        Value *BaseOp1 = BaseInst->getOperand(1);
+        Value *Op0 = Inst->getOperand(0);
+        Value *Op1 = Inst->getOperand(1);
+        CmpInst::Predicate CurrentPred =
+            cast<CmpInst>(VL[Cnt])->getPredicate();
+        CmpInst::Predicate SwappedCurrentPred =
+            CmpInst::getSwappedPredicate(CurrentPred);
+        // Check for compatible operands. If the corresponding operands are not
+        // compatible - need to perform alternate vectorization.
+        if (InstOpcode == Opcode) {
+          if (BasePred == CurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1))
+            continue;
+          if (BasePred == SwappedCurrentPred &&
+              areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0))
+            continue;
+          if (E == 2 &&
+              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
+            continue;
+          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+          CmpInst::Predicate AltPred = AltInst->getPredicate();
+          Value *AltOp0 = AltInst->getOperand(0);
+          Value *AltOp1 = AltInst->getOperand(1);
+          // Check if operands are compatible with alternate operands.
+          if (AltPred == CurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op0, Op1))
+            continue;
+          if (AltPred == SwappedCurrentPred &&
+              areCompatibleCmpOps(AltOp0, AltOp1, Op1, Op0))
+            continue;
+        }
+        if (BaseIndex == AltIndex) {
+          assert(isValidForAlternation(Opcode) &&
+                 isValidForAlternation(InstOpcode) &&
+                 "Cast isn't safe for alternation, logic needs to be updated!");
+          AltIndex = Cnt;
+          continue;
+        }
+        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
+        CmpInst::Predicate AltPred = AltInst->getPredicate();
+        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
+            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
+          continue;
+      }
     } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
       continue;
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
@@ -3307,9 +3377,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     MapVector<OrdersType, unsigned,
               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
         OrdersUses;
+    // Do the analysis for each tree entry only once, otherwise the order of
+    // the same node my be considered several times, though might be not
+    // profitable.
     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
     for (const auto &Op : Data.second) {
       TreeEntry *OpTE = Op.second;
+      if (!VisitedOps.insert(OpTE).second)
+        continue;
       if (!OpTE->ReuseShuffleIndices.empty() ||
           (IgnoreReorder && OpTE == VectorizableTree.front().get()))
         continue;
@@ -3333,9 +3408,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
       }
-      if (VisitedOps.insert(OpTE).second)
-        OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
-            OpTE->UserTreeIndices.size();
+      OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+          OpTE->UserTreeIndices.size();
       assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
       --OrdersUses[{}];
     }
@@ -4350,9 +4424,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
-      if (isa<BinaryOperator>(VL0)) {
+      auto *CI = dyn_cast<CmpInst>(VL0);
+      if (isa<BinaryOperator>(VL0) || CI) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        if (!CI || all_of(VL, [](Value *V) {
+              return cast<CmpInst>(V)->isCommutative();
+            })) {
+          reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        } else {
+          CmpInst::Predicate P0 = CI->getPredicate();
+          CmpInst::Predicate AltP0 = cast<CmpInst>(S.AltOp)->getPredicate();
+          CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
+          Value *BaseOp0 = VL0->getOperand(0);
+          Value *BaseOp1 = VL0->getOperand(1);
+          // Collect operands - commute if it uses the swapped predicate or
+          // alternate operation.
+          for (Value *V : VL) {
+            auto *Cmp = cast<CmpInst>(V);
+            Value *LHS = Cmp->getOperand(0);
+            Value *RHS = Cmp->getOperand(1);
+            CmpInst::Predicate CurrentPred = CI->getPredicate();
+            CmpInst::Predicate CurrentPredSwapped =
+                CmpInst::getSwappedPredicate(CurrentPred);
+            if (P0 == AltP0 || P0 == AltP0Swapped) {
+              if ((P0 == CurrentPred &&
+                   !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
+                  (P0 == CurrentPredSwapped &&
+                   !areCompatibleCmpOps(BaseOp0, BaseOp1, RHS, LHS)))
+                std::swap(LHS, RHS);
+            } else if (!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) {
+              std::swap(LHS, RHS);
+            }
+            Left.push_back(LHS);
+            Right.push_back(RHS);
+          }
+        }
       TE->setOperand(0, Left);
       TE->setOperand(1, Right);
       buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5284,7 +5390,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
-              Instruction::isCast(E->getAltOpcode()))) &&
+              Instruction::isCast(E->getAltOpcode())) ||
+             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
     InstructionCost ScalarCost = 0;
     if (NeedToShuffleReuses) {
@@ -5332,6 +5439,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
       VecCost +=
          TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+      VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+                                        Builder.getInt1Ty(),
+                                        CI0->getPredicate(), CostKind, VL0);
+      VecCost += TTI->getCmpSelInstrCost(
+          E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+          cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
+          E->getAltOp());
     } else {
       Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
       Type *Src1SclTy =
          E->getAltOp()->getOperand(0)->getType();
@@ -5348,6 +5463,29 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
           [E](Instruction *I) {
             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+              auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+              auto *CI = cast<CmpInst>(I);
+              CmpInst::Predicate P0 = CI0->getPredicate();
+              CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+              CmpInst::Predicate AltP0Swapped =
+                  CmpInst::getSwappedPredicate(AltP0);
+              CmpInst::Predicate CurrentPred = CI->getPredicate();
+              CmpInst::Predicate CurrentPredSwapped =
+                  CmpInst::getSwappedPredicate(CurrentPred);
+              if (P0 == AltP0 || P0 == AltP0Swapped) {
+                // Alternate cmps have same/swapped predicate as main cmps but
+                // different order of compatible operands.
+                return !(
+                    (P0 == CurrentPred &&
+                     areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+                                         I->getOperand(0), I->getOperand(1))) ||
+                    (P0 == CurrentPredSwapped &&
+                     areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+                                         I->getOperand(1), I->getOperand(0))));
+              }
+              return CurrentPred != P0 && CurrentPredSwapped != P0;
+            }
             return I->getOpcode() == E->getAltOpcode();
           },
           Mask);
@@ -6830,11 +6968,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
-              Instruction::isCast(E->getAltOpcode()))) &&
+              Instruction::isCast(E->getAltOpcode())) ||
+             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
 
       Value *LHS = nullptr, *RHS = nullptr;
-      if (Instruction::isBinaryOp(E->getOpcode())) {
+      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
         LHS = vectorizeTree(E->getOperand(0));
         RHS = vectorizeTree(E->getOperand(1));
@@ -6854,6 +6993,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
             static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
         V1 = Builder.CreateBinOp(
             static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+        auto *AltCI = cast<CmpInst>(E->getAltOp());
+        CmpInst::Predicate AltPred = AltCI->getPredicate();
+        unsigned AltIdx =
+            std::distance(E->Scalars.begin(), find(E->Scalars, AltCI));
+        if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx])
+          AltPred = CmpInst::getSwappedPredicate(AltPred);
+        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
       } else {
         V0 = Builder.CreateCast(
             static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -6878,6 +7026,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
             E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
             [E](Instruction *I) {
               assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+              if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
+                auto *AltCI0 = cast<CmpInst>(E->getAltOp());
+                auto *CI = cast<CmpInst>(I);
+                CmpInst::Predicate P0 = CI0->getPredicate();
+                CmpInst::Predicate AltP0 = AltCI0->getPredicate();
+                CmpInst::Predicate AltP0Swapped =
+                    CmpInst::getSwappedPredicate(AltP0);
+                CmpInst::Predicate CurrentPred = CI->getPredicate();
+                CmpInst::Predicate CurrentPredSwapped =
+                    CmpInst::getSwappedPredicate(CurrentPred);
+                if (P0 == AltP0 || P0 == AltP0Swapped) {
+                  // Alternate cmps have same/swapped predicate as main cmps but
+                  // different order of compatible operands.
+                  return !(
+                      (P0 == CurrentPred &&
+                       areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+                                           I->getOperand(0), I->getOperand(1))) ||
+                      (P0 == CurrentPredSwapped &&
+                       areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
+                                           I->getOperand(1), I->getOperand(0))));
+                }
+                return CurrentPred != P0 && CurrentPredSwapped != P0;
+              }
               return I->getOpcode() == E->getAltOpcode();
             },
             Mask, &OpScalars, &AltScalars);
@@ -7676,11 +7847,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     for (ScheduleData *BundleMember = picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       Instruction *pickedInst = BundleMember->Inst;
-      if (pickedInst->getNextNode() != LastScheduledInst) {
-        BS->BB->getInstList().remove(pickedInst);
-        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
-                                     pickedInst);
-      }
+      if (pickedInst->getNextNode() != LastScheduledInst)
+        pickedInst->moveBefore(LastScheduledInst);
       LastScheduledInst = pickedInst;
     }
 
@@ -8444,7 +8612,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
       R.reorderTopToBottom();
-      R.reorderBottomToTop();
+      R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
       R.buildExternalUses();
       R.computeMinimumValueSizes();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e5dded3c0f1e..8822c0004eb2 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -75,7 +75,8 @@ class VPRecipeBuilder {
   /// Check if an induction recipe should be constructed for \I. If so build and
   /// return it. If not, return null.
   VPWidenIntOrFpInductionRecipe *
-  tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const;
+  tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands,
+                            VFRange &Range) const;
 
   /// Optimize the special case where the operand of \p I is a constant integer
   /// induction variable.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a96c122db2a9..342d4a074e10 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1649,3 +1649,9 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
     for (VPValue *Def : Recipe.definedValues())
       assignSlot(Def);
 }
+
+bool vputils::onlyFirstLaneUsed(VPValue *Def) {
+  return all_of(Def->users(), [Def](VPUser *U) {
+    return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(Def);
+  });
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 824440f98a8b..bcaabca692cc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -759,6 +759,14 @@ public:
   bool mayReadOrWriteMemory() const {
     return mayReadFromMemory() || mayWriteToMemory();
   }
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  /// Conservatively returns false.
+  virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return false;
+  }
 };
 
 inline bool VPUser::classof(const VPDef *Def) {
@@ -893,6 +901,24 @@ public:
   /// Set the fast-math flags.
   void setFastMathFlags(FastMathFlags FMFNew);
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    if (getOperand(0) != Op)
+      return false;
+    switch (getOpcode()) {
+    default:
+      return false;
+    case VPInstruction::ActiveLaneMask:
+    case VPInstruction::CanonicalIVIncrement:
+    case VPInstruction::CanonicalIVIncrementNUW:
+    case VPInstruction::BranchOnCount:
+      return true;
+    };
+    llvm_unreachable("switch should return");
+  }
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -1027,18 +1053,24 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
   PHINode *IV;
   const InductionDescriptor &IndDesc;
+  bool NeedsScalarIV;
+  bool NeedsVectorIV;
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
-                                const InductionDescriptor &IndDesc)
+                                const InductionDescriptor &IndDesc,
+                                bool NeedsScalarIV, bool NeedsVectorIV)
       : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
-        IV(IV), IndDesc(IndDesc) {}
+        IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+        NeedsVectorIV(NeedsVectorIV) {}
 
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
                                 const InductionDescriptor &IndDesc,
-                                TruncInst *Trunc)
+                                TruncInst *Trunc, bool NeedsScalarIV,
+                                bool NeedsVectorIV)
       : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
-        IV(IV), IndDesc(IndDesc) {}
+        IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
+        NeedsVectorIV(NeedsVectorIV) {}
 
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
@@ -1082,6 +1114,12 @@ public:
     const TruncInst *TruncI = getTruncInst();
     return TruncI ? TruncI->getType() : IV->getType();
   }
+
+  /// Returns true if a scalar phi needs to be created for the induction.
+  bool needsScalarIV() const { return NeedsScalarIV; }
+
+  /// Returns true if a vector phi needs to be created for the induction.
+  bool needsVectorIV() const { return NeedsVectorIV; }
 };
 
 /// A pure virtual base class for all recipes modeling header phis, including
@@ -1318,6 +1356,17 @@ public:
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // Recursing through Blend recipes only, must terminate at header phi's the
+    // latest.
+    return all_of(users(), [this](VPUser *U) {
+      return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(this);
+    });
+  }
 };
 
 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
@@ -1495,6 +1544,13 @@ public:
   bool isPacked() const { return AlsoPack; }
 
   bool isPredicated() const { return IsPredicated; }
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return isUniform();
+  }
 };
 
 /// A recipe for generating conditional branches on the bits of a mask.
@@ -1651,6 +1707,16 @@ public:
   void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
 #endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+
+    // Widened, consecutive memory operations only demand the first lane of
+    // their address.
+    return Op == getAddr() && isConsecutive();
+  }
 };
 
 /// Canonical scalar induction phi of the vector loop. Starting at the specified
@@ -1686,6 +1752,13 @@ public:
   const Type *getScalarType() const {
     return getOperand(0)->getLiveInIRValue()->getType();
   }
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
 };
 
 /// A Recipe for widening the canonical induction variable of the vector loop.
@@ -2766,6 +2839,14 @@ public:
   /// Return true if all visited instruction can be combined.
   bool isCompletelySLP() const { return CompletelySLP; }
 };
+
+namespace vputils {
+
+/// Returns true if only the first lane of \p Def is used.
+bool onlyFirstLaneUsed(VPValue *Def);
+
+} // end namespace vputils
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fb5f3d428189..70ce773a8a85 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -47,7 +47,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
         auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
         if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
           VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
-          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II);
+          NewRecipe =
+              new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true);
         } else {
           Plan->addVPValue(Phi, VPPhi);
           continue;
@@ -341,10 +342,16 @@ void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
 
-    // If the induction recipe is canonical and the types match, use it
-    // directly.
-    if (WidenOriginalIV && WidenOriginalIV->isCanonical() &&
-        WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) {
+    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
+        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
+      continue;
+
+    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
+    // everything WidenNewIV's users need. That is, WidenOriginalIV will
+    // generate a vector phi or all users of WidenNewIV demand the first lane
+    // only.
+    if (WidenOriginalIV->needsVectorIV() ||
+        vputils::onlyFirstLaneUsed(WidenNewIV)) {
       WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
       WidenNewIV->eraseFromParent();
       return;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 0296a995ad29..010ca28fc237 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
 
 using namespace llvm;
```
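The LoopVectorize.cpp hunks above replace InnerLoopVectorizer::needsScalarInduction with two flags, NeedsScalarIV and NeedsVectorIV, computed once in createWidenInductionRecipe and stored on the induction recipe, so code generation only reads the stored answer. Below is a minimal standalone sketch of that classification; Instr, IVFlags, and classifyInduction are invented stand-ins for the LLVM classes, and the per-VF range clamping done by getDecisionAndClampRange is deliberately omitted:

```cpp
// Toy model (not the LLVM API) of the scalar-vs-vector IV decision that this
// commit moves from InnerLoopVectorizer into recipe construction.
#include <algorithm>
#include <vector>

struct Instr {
  bool ScalarAfterVec = false;        // cost model: scalar after vectorization
  bool ProfitableToScalarize = false; // cost model: cheaper as scalar
  bool InLoop = true;                 // lives inside the original loop
  std::vector<Instr *> Users;
};

static bool shouldScalarize(const Instr &I) {
  return I.ScalarAfterVec || I.ProfitableToScalarize;
}

struct IVFlags {
  bool NeedsScalarIV; // emit scalar steps for scalarized users
  bool NeedsVectorIV; // emit a widened vector phi
};

// Mirrors the shape of createWidenInductionRecipe: the IV needs scalar steps
// if it or any in-loop user is scalarized; it needs a vector phi unless the
// IV itself is scalarized.
static IVFlags classifyInduction(const Instr &IV) {
  bool AnyScalarUser = std::any_of(
      IV.Users.begin(), IV.Users.end(),
      [](const Instr *U) { return U->InLoop && shouldScalarize(*U); });
  bool ScalarIVOnly = shouldScalarize(IV);
  return {ScalarIVOnly || AnyScalarUser, !ScalarIVOnly};
}

int main() {
  Instr AddrCompute{/*ScalarAfterVec=*/true, false, true, {}};
  Instr IV{false, false, true, {&AddrCompute}};
  IVFlags F = classifyInduction(IV);
  // IV has a scalarized user, so both a vector phi and scalar steps are built.
  return (F.NeedsScalarIV && F.NeedsVectorIV) ? 0 : 1;
}
```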
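isEpilogueVectorizationProfitable now compares VF.getKnownMinValue() instead of VF.getFixedValue(), which lets scalable main-loop factors reach the epilogue heuristic instead of being converted to fixed-width first. A toy sketch of the distinction follows; EC, knownMin, and epilogueProfitable are invented stand-ins, not llvm::ElementCount:

```cpp
#include <cassert>

// A scalable VF like <vscale x 4 x i32> has no fixed element count, but it
// does have a known minimum (4) that can be compared against a threshold.
struct EC {
  unsigned MinVal;
  bool Scalable;
  unsigned knownMin() const { return MinVal; }
  unsigned fixedValue() const {
    assert(!Scalable && "no fixed value for a scalable element count");
    return MinVal;
  }
};

static bool epilogueProfitable(EC VF, unsigned MinVF = 16) {
  // The old code path queried fixedValue(), which is only valid for
  // fixed-width factors; the known minimum works for both kinds.
  return VF.knownMin() >= MinVF;
}

int main() {
  EC Fixed{16, false}, ScalableVF{4, true};
  assert(epilogueProfitable(Fixed));
  assert(!epilogueProfitable(ScalableVF)); // vscale x 4: min 4 < 16
  return 0;
}
```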
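The new areCompatibleCmpOps/getSameOpcode logic in SLPVectorizer.cpp lets two compares bundle together when one is the other with commuted operands and a swapped predicate. The standalone sketch below illustrates only the predicate-swap rule with a toy Pred enum; the real check is looser, since areCompatibleCmpOps merely requires the corresponding operands to be vectorizable together (e.g. both constants, or instructions sharing an opcode), not identical:

```cpp
#include <cassert>

// Toy predicates, not llvm::CmpInst::Predicate.
enum Pred { LT, GT, LE, GE, EQ, NE };

static Pred swapped(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  default: return P; // EQ/NE are symmetric under operand swap
  }
}

struct Cmp { Pred P; int Op0, Op1; }; // operands as opaque IDs

// Can C be emitted by the same vector cmp as Base, possibly after commuting
// C's operands and swapping its predicate?
static bool matchesMain(const Cmp &Base, const Cmp &C) {
  if (Base.P == C.P && Base.Op0 == C.Op0 && Base.Op1 == C.Op1)
    return true;
  // icmp slt x, y and icmp sgt y, x compute the same bit.
  return Base.P == swapped(C.P) && Base.Op0 == C.Op1 && Base.Op1 == C.Op0;
}

int main() {
  Cmp A{LT, /*x*/ 1, /*y*/ 2};
  Cmp B{GT, /*y*/ 2, /*x*/ 1}; // same comparison, written backwards
  assert(matchesMain(A, B));
  return 0;
}
```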
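When the predicates genuinely differ, the bundle is emitted as alternate vectorization: vectorizeTree materializes both vector compares (V0 with the main predicate, V1 with the alternate, possibly swapped, one) and a shuffle selects per lane the result belonging to that lane's original scalar. A scalar-C++ model of that end result, using plain arrays rather than LLVM IR:

```cpp
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> X{1, 5, 3, 7}, Y{2, 4, 6, 8};
  // Scalars being bundled: lanes 0 and 2 computed x < y; lanes 1 and 3
  // computed x > y.
  std::array<bool, 4> V0, V1, Result;
  for (int i = 0; i < 4; ++i) V0[i] = X[i] < Y[i]; // main cmp vector
  for (int i = 0; i < 4; ++i) V1[i] = X[i] > Y[i]; // alternate cmp vector
  std::array<int, 4> Mask{0, 1, 0, 1};             // 0 -> pick V0, 1 -> pick V1
  for (int i = 0; i < 4; ++i) Result[i] = Mask[i] ? V1[i] : V0[i];
  assert(Result[0] == (1 < 2) && Result[1] == (5 > 4));
  return 0;
}
```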
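Finally, the VPlan.h/VPlan.cpp hunks introduce onlyFirstLaneUsed: each recipe can declare that it reads only lane 0 of an operand, and vputils::onlyFirstLaneUsed(Def) holds when every user says so, which is what buildScalarSteps now queries instead of the cost model. Below is a toy model with invented Recipe and Value types; the ConsecutiveLoad stand-in mirrors the widen-memory recipe, which demands only the first lane of a consecutive address:

```cpp
#include <vector>

struct Recipe;

struct Value {
  std::vector<const Recipe *> Users;
};

struct Recipe {
  // Conservative default, as in the new VPRecipeBase hook.
  virtual bool onlyFirstLaneUsed(const Value *Op) const { return false; }
  virtual ~Recipe() = default;
};

// A widened consecutive load: its address is used as a scalar base pointer,
// so only lane 0 of the address operand is demanded.
struct ConsecutiveLoad : Recipe {
  const Value *Addr;
  explicit ConsecutiveLoad(const Value *A) : Addr(A) {}
  bool onlyFirstLaneUsed(const Value *Op) const override {
    return Op == Addr;
  }
};

// Mirrors vputils::onlyFirstLaneUsed: all users must agree.
static bool onlyFirstLaneUsed(const Value &Def) {
  for (const Recipe *U : Def.Users)
    if (!U->onlyFirstLaneUsed(&Def))
      return false;
  return true;
}

int main() {
  Value Addr;
  ConsecutiveLoad L(&Addr);
  Addr.Users.push_back(&L);
  return onlyFirstLaneUsed(Addr) ? 0 : 1; // lane 0 suffices here
}
```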
