Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
30 files changed, 794 insertions, 599 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 529f7309a1a2..89a1ad2243c8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -2953,6 +2953,9 @@ void coro::salvageDebugInfo( std::optional<BasicBlock::iterator> InsertPt; if (auto *I = dyn_cast<Instruction>(Storage)) { InsertPt = I->getInsertionPointAfterDef(); + // Update DILocation only in O0 since it is easy to get out of sync in + // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for + // an example. if (!OptimizeFrame && I->getDebugLoc()) DVI.setDebugLoc(I->getDebugLoc()); } else if (isa<Argument>(Storage)) @@ -2988,9 +2991,14 @@ void coro::salvageDebugInfo( // dbg.declare does. if (DPV.getType() == DPValue::LocationType::Declare) { std::optional<BasicBlock::iterator> InsertPt; - if (auto *I = dyn_cast<Instruction>(Storage)) + if (auto *I = dyn_cast<Instruction>(Storage)) { InsertPt = I->getInsertionPointAfterDef(); - else if (isa<Argument>(Storage)) + // Update DILocation only in O0 since it is easy to get out of sync in + // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for + // an example. + if (!OptimizeFrame && I->getDebugLoc()) + DPV.setDebugLoc(I->getDebugLoc()); + } else if (isa<Argument>(Storage)) InsertPt = F->getEntryBlock().begin(); if (InsertPt) { DPV.removeFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index b2618e35b085..cc5a4ee8c2bd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -6725,10 +6725,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack { LLVMContext &Ctx = AI.CB->getContext(); ObjectSizeOpts Opts; ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts); - SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB); + SizeOffsetValue SizeOffsetPair = Eval.compute(AI.CB); assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() && - cast<ConstantInt>(SizeOffsetPair.second)->isZero()); - Size = SizeOffsetPair.first; + cast<ConstantInt>(SizeOffsetPair.Offset)->isZero()); + Size = SizeOffsetPair.Size; } Instruction *IP = diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 556fde37efeb..96b612254ca5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1666,13 +1666,6 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Instruction *Ashr = foldAddToAshr(I)) return Ashr; - // min(A, B) + max(A, B) => A + B. 
- if (match(&I, m_CombineOr(m_c_Add(m_SMax(m_Value(A), m_Value(B)), - m_c_SMin(m_Deferred(A), m_Deferred(B))), - m_c_Add(m_UMax(m_Value(A), m_Value(B)), - m_c_UMin(m_Deferred(A), m_Deferred(B)))))) - return BinaryOperator::CreateWithCopiedFlags(Instruction::Add, A, B, &I); - // (~X) + (~Y) --> -2 - (X + Y) { // To ensure we can save instructions we need to ensure that we consume both diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 43d4496571be..40b48699f758 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -172,10 +172,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { // If the memcpy has metadata describing the members, see if we can get the // TBAA tag describing our copy. - MDNode *CopyMD = nullptr; - if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) { - CopyMD = M; - } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { + AAMDNodes AACopyMD = MI->getAAMetadata(); + + if (MDNode *M = AACopyMD.TBAAStruct) { + AACopyMD.TBAAStruct = nullptr; if (M->getNumOperands() == 3 && M->getOperand(0) && mdconst::hasa<ConstantInt>(M->getOperand(0)) && mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() && @@ -184,7 +184,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == Size && M->getOperand(2) && isa<MDNode>(M->getOperand(2))) - CopyMD = cast<MDNode>(M->getOperand(2)); + AACopyMD.TBAA = cast<MDNode>(M->getOperand(2)); } Value *Src = MI->getArgOperand(1); @@ -192,8 +192,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. L->setAlignment(*CopySrcAlign); - if (CopyMD) - L->setMetadata(LLVMContext::MD_tbaa, CopyMD); + L->setAAMetadata(AACopyMD); MDNode *LoopMemParallelMD = MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); if (LoopMemParallelMD) @@ -205,8 +204,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. 
S->setAlignment(*CopyDstAlign); - if (CopyMD) - S->setMetadata(LLVMContext::MD_tbaa, CopyMD); + S->setAAMetadata(AACopyMD); if (LoopMemParallelMD) S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); if (AccessGroupMD) @@ -1536,11 +1534,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } if (II->isCommutative()) { - if (Instruction *I = foldCommutativeIntrinsicOverSelects(*II)) - return I; - - if (Instruction *I = foldCommutativeIntrinsicOverPhis(*II)) - return I; + if (auto Pair = matchSymmetricPair(II->getOperand(0), II->getOperand(1))) { + replaceOperand(*II, 0, Pair->first); + replaceOperand(*II, 1, Pair->second); + return II; + } if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI)) return NewCall; @@ -4246,39 +4244,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, Call.setCalledFunction(FTy, NestF); return &Call; } - -// op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y) -Instruction * -InstCombinerImpl::foldCommutativeIntrinsicOverSelects(IntrinsicInst &II) { - assert(II.isCommutative()); - - Value *A, *B, *C; - if (match(II.getOperand(0), m_Select(m_Value(A), m_Value(B), m_Value(C))) && - match(II.getOperand(1), - m_Select(m_Specific(A), m_Specific(C), m_Specific(B)))) { - replaceOperand(II, 0, B); - replaceOperand(II, 1, C); - return &II; - } - - return nullptr; -} - -Instruction * -InstCombinerImpl::foldCommutativeIntrinsicOverPhis(IntrinsicInst &II) { - assert(II.isCommutative() && "Instruction should be commutative"); - - PHINode *LHS = dyn_cast<PHINode>(II.getOperand(0)); - PHINode *RHS = dyn_cast<PHINode>(II.getOperand(1)); - - if (!LHS || !RHS) - return nullptr; - - if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { - replaceOperand(II, 0, P->first); - replaceOperand(II, 1, P->second); - return &II; - } - - return nullptr; -} diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3875e59c3ede..7c1aff445524 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4920,8 +4920,9 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, } } - if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() && - BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) { + if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && + (BO0->hasOneUse() || BO1->hasOneUse()) && + BO0->getOperand(1) == BO1->getOperand(1)) { switch (BO0->getOpcode()) { default: break; @@ -5047,8 +5048,16 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I, Value *Y = MinMax->getRHS(); if (ICmpInst::isSigned(Pred) && !MinMax->isSigned()) return nullptr; - if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned()) - return nullptr; + if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned()) { + // Revert the transform signed pred -> unsigned pred + // TODO: We can flip the signedness of predicate if both operands of icmp + // are negative. 
+ if (isKnownNonNegative(Z, SQ.getWithInstruction(&I)) && + isKnownNonNegative(MinMax, SQ.getWithInstruction(&I))) { + Pred = ICmpInst::getFlippedSignednessPredicate(Pred); + } else + return nullptr; + } SimplifyQuery Q = SQ.getWithInstruction(&I); auto IsCondKnownTrue = [](Value *Val) -> std::optional<bool> { if (!Val) @@ -6860,6 +6869,57 @@ Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred, return foldICmpAddOpConst(X, *C, Pred); } + // abs(X) >= X --> true + // abs(X) u<= X --> true + // abs(X) < X --> false + // abs(X) u> X --> false + // abs(X) u>= X --> IsIntMinPosion ? `X > -1`: `X u<= INTMIN` + // abs(X) <= X --> IsIntMinPosion ? `X > -1`: `X u<= INTMIN` + // abs(X) == X --> IsIntMinPosion ? `X > -1`: `X u<= INTMIN` + // abs(X) u< X --> IsIntMinPosion ? `X < 0` : `X > INTMIN` + // abs(X) > X --> IsIntMinPosion ? `X < 0` : `X > INTMIN` + // abs(X) != X --> IsIntMinPosion ? `X < 0` : `X > INTMIN` + { + Value *X; + Constant *C; + if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X), m_Constant(C))) && + match(Op1, m_Specific(X))) { + Value *NullValue = Constant::getNullValue(X->getType()); + Value *AllOnesValue = Constant::getAllOnesValue(X->getType()); + const APInt SMin = + APInt::getSignedMinValue(X->getType()->getScalarSizeInBits()); + bool IsIntMinPosion = C->isAllOnesValue(); + switch (Pred) { + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SGE: + return replaceInstUsesWith(CxtI, ConstantInt::getTrue(CxtI.getType())); + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SLT: + return replaceInstUsesWith(CxtI, ConstantInt::getFalse(CxtI.getType())); + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_EQ: { + return replaceInstUsesWith( + CxtI, IsIntMinPosion + ? Builder.CreateICmpSGT(X, AllOnesValue) + : Builder.CreateICmpULT( + X, ConstantInt::get(X->getType(), SMin + 1))); + } + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_NE: { + return replaceInstUsesWith( + CxtI, IsIntMinPosion + ? Builder.CreateICmpSLT(X, NullValue) + : Builder.CreateICmpUGT( + X, ConstantInt::get(X->getType(), SMin))); + } + default: + llvm_unreachable("Invalid predicate!"); + } + } + } + return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index bdaf7550b4b4..21c61bd99018 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -276,17 +276,15 @@ private: bool transformConstExprCastCall(CallBase &Call); Instruction *transformCallThroughTrampoline(CallBase &Call, IntrinsicInst &Tramp); - Instruction *foldCommutativeIntrinsicOverSelects(IntrinsicInst &II); - // Match a pair of Phi Nodes like - // phi [a, BB0], [b, BB1] & phi [b, BB0], [a, BB1] - // Return the matched two operands. - std::optional<std::pair<Value *, Value *>> - matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS); - - // Tries to fold (op phi(a, b) phi(b, a)) -> (op a, b) - // while op is a commutative intrinsic call. - Instruction *foldCommutativeIntrinsicOverPhis(IntrinsicInst &II); + // Return (a, b) if (LHS, RHS) is known to be (a, b) or (b, a). 
+ // Otherwise, return std::nullopt + // Currently it matches: + // - LHS = (select c, a, b), RHS = (select c, b, a) + // - LHS = (phi [a, BB0], [b, BB1]), RHS = (phi [b, BB0], [a, BB1]) + // - LHS = min(a, b), RHS = max(a, b) + std::optional<std::pair<Value *, Value *>> matchSymmetricPair(Value *LHS, + Value *RHS); Value *simplifyMaskedLoad(IntrinsicInst &II); Instruction *simplifyMaskedStore(IntrinsicInst &II); @@ -502,11 +500,6 @@ public: /// X % (C0 * C1) Value *SimplifyAddWithRemainder(BinaryOperator &I); - // Tries to fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b) - // while Binop is commutative. - Value *SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, Value *LHS, - Value *RHS); - // Binary Op helper for select operations where the expression can be // efficiently reorganized. Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f0ea3d9fcad5..e7f983a00e30 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -487,13 +487,6 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I)) return Res; - // min(X, Y) * max(X, Y) => X * Y. - if (match(&I, m_CombineOr(m_c_Mul(m_SMax(m_Value(X), m_Value(Y)), - m_c_SMin(m_Deferred(X), m_Deferred(Y))), - m_c_Mul(m_UMax(m_Value(X), m_Value(Y)), - m_c_UMin(m_Deferred(X), m_Deferred(Y)))))) - return BinaryOperator::CreateWithCopiedFlags(Instruction::Mul, X, Y, &I); - // (mul Op0 Op1): // if Log2(Op0) folds away -> // (shl Op1, Log2(Op0)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 351fc3b0174f..7f2018b3a199 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -411,6 +411,14 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { getComplexity(I.getOperand(1))) Changed = !I.swapOperands(); + if (I.isCommutative()) { + if (auto Pair = matchSymmetricPair(I.getOperand(0), I.getOperand(1))) { + replaceOperand(I, 0, Pair->first); + replaceOperand(I, 1, Pair->second); + Changed = true; + } + } + BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0)); BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1)); @@ -1096,8 +1104,8 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) { return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); } -std::optional<std::pair<Value *, Value *>> -InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { +static std::optional<std::pair<Value *, Value *>> +matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { if (LHS->getParent() != RHS->getParent()) return std::nullopt; @@ -1123,25 +1131,41 @@ InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { return std::optional(std::pair(L0, R0)); } -Value *InstCombinerImpl::SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, - Value *Op0, - Value *Op1) { - assert(I.isCommutative() && "Instruction should be commutative"); - - PHINode *LHS = dyn_cast<PHINode>(Op0); - PHINode *RHS = dyn_cast<PHINode>(Op1); - - if (!LHS || !RHS) - return nullptr; - - if 
(auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { - Value *BI = Builder.CreateBinOp(I.getOpcode(), P->first, P->second); - if (auto *BO = dyn_cast<BinaryOperator>(BI)) - BO->copyIRFlags(&I); - return BI; +std::optional<std::pair<Value *, Value *>> +InstCombinerImpl::matchSymmetricPair(Value *LHS, Value *RHS) { + Instruction *LHSInst = dyn_cast<Instruction>(LHS); + Instruction *RHSInst = dyn_cast<Instruction>(RHS); + if (!LHSInst || !RHSInst || LHSInst->getOpcode() != RHSInst->getOpcode()) + return std::nullopt; + switch (LHSInst->getOpcode()) { + case Instruction::PHI: + return matchSymmetricPhiNodesPair(cast<PHINode>(LHS), cast<PHINode>(RHS)); + case Instruction::Select: { + Value *Cond = LHSInst->getOperand(0); + Value *TrueVal = LHSInst->getOperand(1); + Value *FalseVal = LHSInst->getOperand(2); + if (Cond == RHSInst->getOperand(0) && TrueVal == RHSInst->getOperand(2) && + FalseVal == RHSInst->getOperand(1)) + return std::pair(TrueVal, FalseVal); + return std::nullopt; + } + case Instruction::Call: { + // Match min(a, b) and max(a, b) + MinMaxIntrinsic *LHSMinMax = dyn_cast<MinMaxIntrinsic>(LHSInst); + MinMaxIntrinsic *RHSMinMax = dyn_cast<MinMaxIntrinsic>(RHSInst); + if (LHSMinMax && RHSMinMax && + LHSMinMax->getPredicate() == + ICmpInst::getSwappedPredicate(RHSMinMax->getPredicate()) && + ((LHSMinMax->getLHS() == RHSMinMax->getLHS() && + LHSMinMax->getRHS() == RHSMinMax->getRHS()) || + (LHSMinMax->getLHS() == RHSMinMax->getRHS() && + LHSMinMax->getRHS() == RHSMinMax->getLHS()))) + return std::pair(LHSMinMax->getLHS(), LHSMinMax->getRHS()); + return std::nullopt; + } + default: + return std::nullopt; } - - return nullptr; } Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, @@ -1187,14 +1211,6 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, }; if (LHSIsSelect && RHSIsSelect && A == D) { - // op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y) - if (I.isCommutative() && B == F && C == E) { - Value *BI = Builder.CreateBinOp(I.getOpcode(), B, E); - if (auto *BO = dyn_cast<BinaryOperator>(BI)) - BO->copyIRFlags(&I); - return BI; - } - // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F) Cond = A; True = simplifyBinOp(Opcode, B, E, FMF, Q); @@ -1577,11 +1593,6 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { BO.getParent() != Phi1->getParent()) return nullptr; - if (BO.isCommutative()) { - if (Value *V = SimplifyPhiCommutativeBinaryOp(BO, Phi0, Phi1)) - return replaceInstUsesWith(BO, V); - } - // Fold if there is at least one specific constant value in phi0 or phi1's // incoming values that comes from the same block and this specific constant // value can be used to do optimization for specific binary operator. @@ -3197,6 +3208,64 @@ Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) { return replaceOperand(SI, 0, Op0); } + ConstantInt *SubLHS; + if (match(Cond, m_Sub(m_ConstantInt(SubLHS), m_Value(Op0)))) { + // Change 'switch (1-X) case 1:' into 'switch (X) case 0'. 
+ for (auto Case : SI.cases()) { + Constant *NewCase = ConstantExpr::getSub(SubLHS, Case.getCaseValue()); + assert(isa<ConstantInt>(NewCase) && + "Result of expression should be constant"); + Case.setValue(cast<ConstantInt>(NewCase)); + } + return replaceOperand(SI, 0, Op0); + } + + uint64_t ShiftAmt; + if (match(Cond, m_Shl(m_Value(Op0), m_ConstantInt(ShiftAmt))) && + ShiftAmt < Op0->getType()->getScalarSizeInBits() && + all_of(SI.cases(), [&](const auto &Case) { + return Case.getCaseValue()->getValue().countr_zero() >= ShiftAmt; + })) { + // Change 'switch (X << 2) case 4:' into 'switch (X) case 1:'. + OverflowingBinaryOperator *Shl = cast<OverflowingBinaryOperator>(Cond); + if (Shl->hasNoUnsignedWrap() || Shl->hasNoSignedWrap() || + Shl->hasOneUse()) { + Value *NewCond = Op0; + if (!Shl->hasNoUnsignedWrap() && !Shl->hasNoSignedWrap()) { + // If the shift may wrap, we need to mask off the shifted bits. + unsigned BitWidth = Op0->getType()->getScalarSizeInBits(); + NewCond = Builder.CreateAnd( + Op0, APInt::getLowBitsSet(BitWidth, BitWidth - ShiftAmt)); + } + for (auto Case : SI.cases()) { + const APInt &CaseVal = Case.getCaseValue()->getValue(); + APInt ShiftedCase = Shl->hasNoSignedWrap() ? CaseVal.ashr(ShiftAmt) + : CaseVal.lshr(ShiftAmt); + Case.setValue(ConstantInt::get(SI.getContext(), ShiftedCase)); + } + return replaceOperand(SI, 0, NewCond); + } + } + + // Fold switch(zext/sext(X)) into switch(X) if possible. + if (match(Cond, m_ZExtOrSExt(m_Value(Op0)))) { + bool IsZExt = isa<ZExtInst>(Cond); + Type *SrcTy = Op0->getType(); + unsigned NewWidth = SrcTy->getScalarSizeInBits(); + + if (all_of(SI.cases(), [&](const auto &Case) { + const APInt &CaseVal = Case.getCaseValue()->getValue(); + return IsZExt ? CaseVal.isIntN(NewWidth) + : CaseVal.isSignedIntN(NewWidth); + })) { + for (auto &Case : SI.cases()) { + APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth); + Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase)); + } + return replaceOperand(SI, 0, Op0); + } + } + KnownBits Known = computeKnownBits(Cond, 0, &SI); unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index afb0e6cd1548..e3deafa49bd9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -174,6 +174,8 @@ const char kAsanAllocasUnpoison[] = "__asan_allocas_unpoison"; const char kAMDGPUAddressSharedName[] = "llvm.amdgcn.is.shared"; const char kAMDGPUAddressPrivateName[] = "llvm.amdgcn.is.private"; +const char kAMDGPUBallotName[] = "llvm.amdgcn.ballot.i64"; +const char kAMDGPUUnreachableName[] = "llvm.amdgcn.unreachable"; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. 
static const size_t kNumberOfAccessSizes = 5; @@ -699,6 +701,8 @@ struct AddressSanitizer { Instruction *InsertBefore, Value *Addr, uint32_t TypeStoreSize, bool IsWrite, Value *SizeArgument); + Instruction *genAMDGPUReportBlock(IRBuilder<> &IRB, Value *Cond, + bool Recover); void instrumentUnusualSizeOrAlignment(Instruction *I, Instruction *InsertBefore, Value *Addr, TypeSize TypeStoreSize, bool IsWrite, @@ -1721,6 +1725,30 @@ Instruction *AddressSanitizer::instrumentAMDGPUAddress( return InsertBefore; } +Instruction *AddressSanitizer::genAMDGPUReportBlock(IRBuilder<> &IRB, + Value *Cond, bool Recover) { + Module &M = *IRB.GetInsertBlock()->getModule(); + Value *ReportCond = Cond; + if (!Recover) { + auto Ballot = M.getOrInsertFunction(kAMDGPUBallotName, IRB.getInt64Ty(), + IRB.getInt1Ty()); + ReportCond = IRB.CreateIsNotNull(IRB.CreateCall(Ballot, {Cond})); + } + + auto *Trm = + SplitBlockAndInsertIfThen(ReportCond, &*IRB.GetInsertPoint(), false, + MDBuilder(*C).createBranchWeights(1, 100000)); + Trm->getParent()->setName("asan.report"); + + if (Recover) + return Trm; + + Trm = SplitBlockAndInsertIfThen(Cond, Trm, false); + IRB.SetInsertPoint(Trm); + return IRB.CreateCall( + M.getOrInsertFunction(kAMDGPUUnreachableName, IRB.getVoidTy()), {}); +} + void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, MaybeAlign Alignment, @@ -1772,7 +1800,15 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, size_t Granularity = 1ULL << Mapping.Scale; Instruction *CrashTerm = nullptr; - if (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity)) { + bool GenSlowPath = (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity)); + + if (TargetTriple.isAMDGCN()) { + if (GenSlowPath) { + auto *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeStoreSize); + Cmp = IRB.CreateAnd(Cmp, Cmp2); + } + CrashTerm = genAMDGPUReportBlock(IRB, Cmp, Recover); + } else if (GenSlowPath) { // We use branch weights for the slow path check, to indicate that the slow // path is rarely taken. This seems to be the case for SPEC benchmarks. Instruction *CheckTerm = SplitBlockAndInsertIfThen( @@ -3629,10 +3665,14 @@ bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, // TODO: We can use vscale_range to convert a scalable value to an // upper bound on the access size. return false; - SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr); - if (!ObjSizeVis.bothKnown(SizeOffset)) return false; - uint64_t Size = SizeOffset.first.getZExtValue(); - int64_t Offset = SizeOffset.second.getSExtValue(); + + SizeOffsetAPInt SizeOffset = ObjSizeVis.compute(Addr); + if (!SizeOffset.bothKnown()) + return false; + + uint64_t Size = SizeOffset.Size.getZExtValue(); + int64_t Offset = SizeOffset.Offset.getSExtValue(); + // Three checks are required to ensure safety: // . Offset >= 0 (since the offset is given from the base ptr) // . 
Size >= Offset (unsigned) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index ee5b81960417..cfa8ae26c625 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -61,15 +61,15 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal, LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) << " bytes\n"); - SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr); + SizeOffsetValue SizeOffset = ObjSizeEval.compute(Ptr); - if (!ObjSizeEval.bothKnown(SizeOffset)) { + if (!SizeOffset.bothKnown()) { ++ChecksUnable; return nullptr; } - Value *Size = SizeOffset.first; - Value *Offset = SizeOffset.second; + Value *Size = SizeOffset.Size; + Value *Offset = SizeOffset.Offset; ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); Type *IndexTy = DL.getIndexType(Ptr->getType()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index fe5a0578bd97..a19b14087254 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1189,12 +1189,10 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { } static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { - // Don't do this for Darwin. compiler-rt uses linker magic. - if (TT.isOSDarwin()) - return false; - // Use linker script magic to get data/cnts/name start/end. - if (TT.isOSAIX() || TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || - TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS() || TT.isOSWindows()) + // compiler-rt uses linker support to get data/counters/name start/end for + // ELF, COFF, Mach-O and XCOFF. + if (TT.isOSBinFormatELF() || TT.isOSBinFormatCOFF() || + TT.isOSBinFormatMachO() || TT.isOSBinFormatXCOFF()) return false; return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 3a57709c4e8b..6b95c7028d93 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -330,10 +330,6 @@ extern cl::opt<std::string> ViewBlockFreqFuncName; extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate; } // namespace llvm -static cl::opt<bool> - PGOOldCFGHashing("pgo-instr-old-cfg-hashing", cl::init(false), cl::Hidden, - cl::desc("Use the old CFG function hashing")); - // Return a string describing the branch condition that can be // used in static branch probability heuristics: static std::string getBranchCondString(Instruction *TI) { @@ -635,34 +631,25 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { JC.update(Indexes); JamCRC JCH; - if (PGOOldCFGHashing) { - // Hash format for context sensitive profile. Reserve 4 bits for other - // information. - FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 | - (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 | - //(uint64_t)ValueSites[IPVK_MemOPSize].size() << 40 | - (uint64_t)MST.numEdges() << 32 | JC.getCRC(); + // The higher 32 bits. 
+ auto updateJCH = [&JCH](uint64_t Num) { + uint8_t Data[8]; + support::endian::write64le(Data, Num); + JCH.update(Data); + }; + updateJCH((uint64_t)SIVisitor.getNumOfSelectInsts()); + updateJCH((uint64_t)ValueSites[IPVK_IndirectCallTarget].size()); + updateJCH((uint64_t)ValueSites[IPVK_MemOPSize].size()); + if (BCI) { + updateJCH(BCI->getInstrumentedBlocksHash()); } else { - // The higher 32 bits. - auto updateJCH = [&JCH](uint64_t Num) { - uint8_t Data[8]; - support::endian::write64le(Data, Num); - JCH.update(Data); - }; - updateJCH((uint64_t)SIVisitor.getNumOfSelectInsts()); - updateJCH((uint64_t)ValueSites[IPVK_IndirectCallTarget].size()); - updateJCH((uint64_t)ValueSites[IPVK_MemOPSize].size()); - if (BCI) { - updateJCH(BCI->getInstrumentedBlocksHash()); - } else { - updateJCH((uint64_t)MST.numEdges()); - } - - // Hash format for context sensitive profile. Reserve 4 bits for other - // information. - FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); + updateJCH((uint64_t)MST.numEdges()); } + // Hash format for context sensitive profile. Reserve 4 bits for other + // information. + FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); + // Reserve bit 60-63 for other information purpose. FunctionHash &= 0x0FFFFFFFFFFFFFFF; if (IsCS) @@ -672,10 +659,8 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { << ", Selects = " << SIVisitor.getNumOfSelectInsts() << ", Edges = " << MST.numEdges() << ", ICSites = " << ValueSites[IPVK_IndirectCallTarget].size()); - if (!PGOOldCFGHashing) { - LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size() - << ", High32 CRC = " << JCH.getCRC()); - } + LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size() + << ", High32 CRC = " << JCH.getCRC()); LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";); if (PGOTraceFuncHash != "-" && F.getName().contains(PGOTraceFuncHash)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 06c87bd6dc37..6fec54ac7922 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -517,6 +517,18 @@ static Decomposition decompose(Value *V, return Result; } + // (shl nsw x, shift) is (mul nsw x, (1<<shift)), with the exception of + // shift == bw-1. + if (match(V, m_NSWShl(m_Value(Op0), m_ConstantInt(CI)))) { + uint64_t Shift = CI->getValue().getLimitedValue(); + if (Shift < Ty->getIntegerBitWidth() - 1) { + assert(Shift < 64 && "Would overflow"); + auto Result = decompose(Op0, Preconditions, IsSigned, DL); + Result.mul(int64_t(1) << Shift); + return Result; + } + } + return V; } @@ -644,7 +656,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, // First try to look up \p V in Value2Index and NewVariables. Otherwise add a // new entry to NewVariables. - DenseMap<Value *, unsigned> NewIndexMap; + SmallDenseMap<Value *, unsigned> NewIndexMap; auto GetOrAddIndex = [&Value2Index, &NewVariables, &NewIndexMap](Value *V) -> unsigned { auto V2I = Value2Index.find(V); @@ -668,7 +680,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, IsSigned, IsEq, IsNe); // Collect variables that are known to be positive in all uses in the // constraint. 
- DenseMap<Value *, bool> KnownNonNegativeVariables; + SmallDenseMap<Value *, bool> KnownNonNegativeVariables; auto &R = Res.Coefficients; for (const auto &KV : VariablesA) { R[GetOrAddIndex(KV.Variable)] += KV.Coefficient; @@ -921,15 +933,20 @@ void State::addInfoForInductions(BasicBlock &BB) { } DomTreeNode *DTN = DT.getNode(InLoopSucc); - auto Inc = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT); - bool MonotonicallyIncreasing = - Inc && *Inc == ScalarEvolution::MonotonicallyIncreasing; - if (MonotonicallyIncreasing) { - // SCEV guarantees that AR does not wrap, so PN >= StartValue can be added - // unconditionally. + auto IncUnsigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT); + auto IncSigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_SGT); + bool MonotonicallyIncreasingUnsigned = + IncUnsigned && *IncUnsigned == ScalarEvolution::MonotonicallyIncreasing; + bool MonotonicallyIncreasingSigned = + IncSigned && *IncSigned == ScalarEvolution::MonotonicallyIncreasing; + // If SCEV guarantees that AR does not wrap, PN >= StartValue can be added + // unconditionally. + if (MonotonicallyIncreasingUnsigned) WorkList.push_back( FactOrCheck::getConditionFact(DTN, CmpInst::ICMP_UGE, PN, StartValue)); - } + if (MonotonicallyIncreasingSigned) + WorkList.push_back( + FactOrCheck::getConditionFact(DTN, CmpInst::ICMP_SGE, PN, StartValue)); APInt StepOffset; if (auto *C = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) @@ -953,11 +970,17 @@ void State::addInfoForInductions(BasicBlock &BB) { WorkList.push_back(FactOrCheck::getConditionFact( DTN, CmpInst::ICMP_UGE, StartValue, PN, ConditionTy(CmpInst::ICMP_ULE, B, StartValue))); + WorkList.push_back(FactOrCheck::getConditionFact( + DTN, CmpInst::ICMP_SGE, StartValue, PN, + ConditionTy(CmpInst::ICMP_SLE, B, StartValue))); // Add PN > B conditional on B <= StartValue which guarantees that the loop // exits when reaching B with a step of -1. WorkList.push_back(FactOrCheck::getConditionFact( DTN, CmpInst::ICMP_UGT, PN, B, ConditionTy(CmpInst::ICMP_ULE, B, StartValue))); + WorkList.push_back(FactOrCheck::getConditionFact( + DTN, CmpInst::ICMP_SGT, PN, B, + ConditionTy(CmpInst::ICMP_SLE, B, StartValue))); return; } @@ -968,37 +991,31 @@ void State::addInfoForInductions(BasicBlock &BB) { return; if (!StepOffset.isOne()) { - auto *UpperGEP = dyn_cast<GetElementPtrInst>(B); - if (!UpperGEP || UpperGEP->getPointerOperand() != StartValue || - !UpperGEP->isInBounds()) - return; - - MapVector<Value *, APInt> UpperVariableOffsets; - APInt UpperConstantOffset(StepOffset.getBitWidth(), 0); - const DataLayout &DL = BB.getModule()->getDataLayout(); - if (!UpperGEP->collectOffset(DL, StepOffset.getBitWidth(), - UpperVariableOffsets, UpperConstantOffset)) - return; - // All variable offsets and the constant offset have to be a multiple of the - // step. - if (!UpperConstantOffset.urem(StepOffset).isZero() || - any_of(UpperVariableOffsets, [&StepOffset](const auto &P) { - return !P.second.urem(StepOffset).isZero(); - })) + // Check whether B-Start is known to be a multiple of StepOffset. + const SCEV *BMinusStart = SE.getMinusSCEV(SE.getSCEV(B), StartSCEV); + if (isa<SCEVCouldNotCompute>(BMinusStart) || + !SE.getConstantMultiple(BMinusStart).urem(StepOffset).isZero()) return; } // AR may wrap. Add PN >= StartValue conditional on StartValue <= B which // guarantees that the loop exits before wrapping in combination with the // restrictions on B and the step above. 
- if (!MonotonicallyIncreasing) { + if (!MonotonicallyIncreasingUnsigned) WorkList.push_back(FactOrCheck::getConditionFact( DTN, CmpInst::ICMP_UGE, PN, StartValue, ConditionTy(CmpInst::ICMP_ULE, StartValue, B))); - } + if (!MonotonicallyIncreasingSigned) + WorkList.push_back(FactOrCheck::getConditionFact( + DTN, CmpInst::ICMP_SGE, PN, StartValue, + ConditionTy(CmpInst::ICMP_SLE, StartValue, B))); + WorkList.push_back(FactOrCheck::getConditionFact( DTN, CmpInst::ICMP_ULT, PN, B, ConditionTy(CmpInst::ICMP_ULE, StartValue, B))); + WorkList.push_back(FactOrCheck::getConditionFact( + DTN, CmpInst::ICMP_SLT, PN, B, + ConditionTy(CmpInst::ICMP_SLE, StartValue, B))); } void State::addInfoFor(BasicBlock &BB) { @@ -1655,15 +1672,14 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, DFSInStack); } - LLVM_DEBUG(dbgs() << "Processing "); - // For a block, check if any CmpInsts become known based on the current set // of constraints. if (CB.isCheck()) { Instruction *Inst = CB.getInstructionToSimplify(); if (!Inst) continue; - LLVM_DEBUG(dbgs() << "condition to simplify: " << *Inst << "\n"); + LLVM_DEBUG(dbgs() << "Processing condition to simplify: " << *Inst + << "\n"); if (auto *II = dyn_cast<WithOverflowInst>(Inst)) { Changed |= tryToSimplifyOverflowMath(II, Info, ToRemove); } else if (auto *Cmp = dyn_cast<ICmpInst>(Inst)) { @@ -1682,7 +1698,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } auto AddFact = [&](CmpInst::Predicate Pred, Value *A, Value *B) { - LLVM_DEBUG(dbgs() << "fact to add to the system: "; + LLVM_DEBUG(dbgs() << "Processing fact to add to the system: "; dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "\n"); if (Info.getCS(CmpInst::isSigned(Pred)).size() > MaxRows) { LLVM_DEBUG( @@ -1731,8 +1747,17 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, A = CB.Cond.Op0; B = CB.Cond.Op1; if (CB.DoesHold.Pred != CmpInst::BAD_ICMP_PREDICATE && - !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1)) + !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1)) { + LLVM_DEBUG({ + dbgs() << "Not adding fact "; + dumpUnpackedICmp(dbgs(), Pred, A, B); + dbgs() << " because precondition "; + dumpUnpackedICmp(dbgs(), CB.DoesHold.Pred, CB.DoesHold.Op0, + CB.DoesHold.Op1); + dbgs() << " does not hold.\n"; + }); continue; + } } else { bool Matched = match(CB.Inst, m_Intrinsic<Intrinsic::assume>( m_ICmp(Pred, m_Value(A), m_Value(B)))); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index c44d3748a80d..9235850de92f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -94,6 +94,31 @@ STATISTIC(NumUDivURemsNarrowedExpanded, "Number of bound udiv's/urem's expanded"); STATISTIC(NumZExt, "Number of non-negative deductions"); +static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { + if (Constant *C = LVI->getConstant(V, At)) + return C; + + // TODO: The following really should be sunk inside LVI's core algorithm, or + // at least the outer shims around such. 
+ auto *C = dyn_cast<CmpInst>(V); + if (!C) + return nullptr; + + Value *Op0 = C->getOperand(0); + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) + return nullptr; + + LazyValueInfo::Tristate Result = LVI->getPredicateAt( + C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false); + if (Result == LazyValueInfo::Unknown) + return nullptr; + + return (Result == LazyValueInfo::True) + ? ConstantInt::getTrue(C->getContext()) + : ConstantInt::getFalse(C->getContext()); +} + static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { if (S->getType()->isVectorTy() || isa<Constant>(S->getCondition())) return false; @@ -106,7 +131,7 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { C = LVI->getConstantOnEdge(S->getCondition(), PN->getIncomingBlock(U), I->getParent(), I); else - C = LVI->getConstant(S->getCondition(), I); + C = getConstantAt(S->getCondition(), I, LVI); auto *CI = dyn_cast_or_null<ConstantInt>(C); if (!CI) @@ -1109,30 +1134,6 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { return true; } - -static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { - if (Constant *C = LVI->getConstant(V, At)) - return C; - - // TODO: The following really should be sunk inside LVI's core algorithm, or - // at least the outer shims around such. - auto *C = dyn_cast<CmpInst>(V); - if (!C) return nullptr; - - Value *Op0 = C->getOperand(0); - Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); - if (!Op1) return nullptr; - - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false); - if (Result == LazyValueInfo::Unknown) - return nullptr; - - return (Result == LazyValueInfo::True) ? - ConstantInt::getTrue(C->getContext()) : - ConstantInt::getFalse(C->getContext()); -} - static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, const SimplifyQuery &SQ) { bool FnChanged = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index 656abdb0abbf..75cddfa16d6d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1097,10 +1097,8 @@ private: // For array or vector indices, scale the index by the size of the // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); - GEPOffset += - Index * - APInt(Offset.getBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType()).getFixedValue()); + GEPOffset += Index * APInt(Offset.getBitWidth(), + GTI.getSequentialElementStride(DL)); } // If this index has computed an intermediate pointer which is not diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index b8c9d9d100f1..225dd454068c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -843,7 +843,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, // constant offset to a byte offset, and later offset the remainder of // the original GEP with this byte offset. 
AccumulativeByteOffset += - ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + ConstantOffset * GTI.getSequentialElementStride(*DL); } } else if (LowerGEP) { StructType *StTy = GTI.getStructType(); @@ -884,7 +884,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( continue; APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + GTI.getSequentialElementStride(*DL)); // Scale the index by element size. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { @@ -946,7 +946,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, continue; APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL->getTypeAllocSize(GTI.getIndexedType())); + GTI.getSequentialElementStride(*DL)); // Scale the index by element size. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 543469d62fe7..ca1f3a0c0ae3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -547,7 +547,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( // indices except this current one. const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs); Value *ArrayIdx = GEP->getOperand(I); - uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + uint64_t ElementSize = GTI.getSequentialElementStride(*DL); if (ArrayIdx->getType()->getIntegerBitWidth() <= DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if ArrayIdx is wider than the index size, because diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp index 0990c750af55..ea3135630665 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -33,37 +33,37 @@ STATISTIC(NumVFDeclAdded, STATISTIC(NumCompUsedAdded, "Number of `@llvm.compiler.used` operands that have been added."); -/// A helper function that adds the vector function declaration that -/// vectorizes the CallInst CI with a vectorization factor of VF -/// lanes. The TLI assumes that all parameters and the return type of -/// CI (other than void) need to be widened to a VectorType of VF -/// lanes. +/// A helper function that adds the vector variant declaration for vectorizing +/// the CallInst \p CI with a vectorization factor of \p VF lanes. For each +/// mapping, TLI provides a VABI prefix, which contains all information required +/// to create vector function declaration. static void addVariantDeclaration(CallInst &CI, const ElementCount &VF, - bool Predicate, const StringRef VFName) { + const VecDesc *VD) { Module *M = CI.getModule(); + FunctionType *ScalarFTy = CI.getFunctionType(); - // Add function declaration. 
- Type *RetTy = ToVectorTy(CI.getType(), VF); - SmallVector<Type *, 4> Tys; - for (Value *ArgOperand : CI.args()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - assert(!CI.getFunctionType()->isVarArg() && - "VarArg functions are not supported."); - if (Predicate) - Tys.push_back(ToVectorTy(Type::getInt1Ty(RetTy->getContext()), VF)); - FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false); - Function *VectorF = - Function::Create(FTy, Function::ExternalLinkage, VFName, M); - VectorF->copyAttributesFrom(CI.getCalledFunction()); + assert(!ScalarFTy->isVarArg() && "VarArg functions are not supported."); + + const std::optional<VFInfo> Info = VFABI::tryDemangleForVFABI( + VD->getVectorFunctionABIVariantString(), ScalarFTy); + + assert(Info && "Failed to demangle vector variant"); + assert(Info->Shape.VF == VF && "Mangled name does not match VF"); + + const StringRef VFName = VD->getVectorFnName(); + FunctionType *VectorFTy = VFABI::createFunctionType(*Info, ScalarFTy); + Function *VecFunc = + Function::Create(VectorFTy, Function::ExternalLinkage, VFName, M); + VecFunc->copyAttributesFrom(CI.getCalledFunction()); ++NumVFDeclAdded; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName - << "` of type " << *(VectorF->getType()) << "\n"); + << "` of type " << *VectorFTy << "\n"); // Make function declaration (without a body) "sticky" in the IR by // listing it in the @llvm.compiler.used intrinsic. - assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` " + assert(!VecFunc->size() && "VFABI attribute requires `@llvm.compiler.used` " "only on declarations."); - appendToCompilerUsed(*M, {VectorF}); + appendToCompilerUsed(*M, {VecFunc}); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName << "` to `@llvm.compiler.used`.\n"); ++NumCompUsedAdded; @@ -100,7 +100,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { } Function *VariantF = M->getFunction(VD->getVectorFnName()); if (!VariantF) - addVariantDeclaration(CI, VF, Predicate, VD->getVectorFnName()); + addVariantDeclaration(CI, VF, VD); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp index ab95698abc43..3dc6016a0a37 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -310,6 +310,7 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); } + DefaultDest->removePredecessor(BB); SI->setDefaultDest(NewUnreachableBB); Updates.push_back({DominatorTree::Delete, BB, DefaultDest}); Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB}); @@ -1063,14 +1064,17 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, // is ready. if (SCValue.isConstantRange(/*UndefAllowed=*/false)) { const ConstantRange &Range = SCValue.getConstantRange(); + unsigned ReachableCaseCount = 0; for (const auto &Case : SI->cases()) { const APInt &CaseValue = Case.getCaseValue()->getValue(); - if (Range.contains(CaseValue)) + if (Range.contains(CaseValue)) { Succs[Case.getSuccessorIndex()] = true; + ++ReachableCaseCount; + } } - // TODO: Determine whether default case is reachable. 
- Succs[SI->case_default()->getSuccessorIndex()] = true; + Succs[SI->case_default()->getSuccessorIndex()] = + Range.isSizeLargerThan(ReachableCaseCount); return; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 55e375670cc6..61d891d65346 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5414,11 +5414,13 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { } static void createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU) { + DomTreeUpdater *DTU, + bool RemoveOrigDefaultBlock = true) { LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); auto *BB = Switch->getParent(); auto *OrigDefaultBlock = Switch->getDefaultDest(); - OrigDefaultBlock->removePredecessor(BB); + if (RemoveOrigDefaultBlock) + OrigDefaultBlock->removePredecessor(BB); BasicBlock *NewDefaultBlock = BasicBlock::Create( BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), OrigDefaultBlock); @@ -5427,7 +5429,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch, if (DTU) { SmallVector<DominatorTree::UpdateType, 2> Updates; Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); - if (!is_contained(successors(BB), OrigDefaultBlock)) + if (RemoveOrigDefaultBlock && + !is_contained(successors(BB), OrigDefaultBlock)) Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); DTU->applyUpdates(Updates); } @@ -5609,10 +5612,28 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */ && - SI->getNumCases() == (1ULL << NumUnknownBits)) { - createUnreachableSwitchDefault(SI, DTU); - return true; + NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = + cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } if (DeadCases.empty()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 760a626c8b6f..a7cd68e860e4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -3735,26 +3735,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { // Also try to simplify calls to fortified library functions. 
if (Value *SimplifiedFortifiedCI = - FortifiedSimplifier.optimizeCall(CI, Builder)) { - // Try to further simplify the result. - CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); - if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { - // Ensure that SimplifiedCI's uses are complete, since some calls have - // their uses analyzed. - replaceAllUsesWith(CI, SimplifiedCI); - - // Set insertion point to SimplifiedCI to guarantee we reach all uses - // we might replace later on. - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(SimplifiedCI); - if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { - // If we were able to further simplify, remove the now redundant call. - substituteInParent(SimplifiedCI, V); - return V; - } - } + FortifiedSimplifier.optimizeCall(CI, Builder)) return SimplifiedFortifiedCI; - } // Then check for known library functions. if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index fa2459d1ca02..1f11d4894f77 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1193,7 +1193,7 @@ std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs( OpA->getType() != OpB->getType()) return std::nullopt; - uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); + uint64_t Stride = GTIA.getSequentialElementStride(DL); // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 577ce8000de2..cff72ae263d8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,9 +167,14 @@ public: } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL, const Twine &Name = "") { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, - Name); + DebugLoc DL, const Twine &Name = "", + std::optional<FastMathFlags> FMFs = std::nullopt) { + auto *Select = + FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, + *FMFs, DL, Name) + : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, + DL, Name); + return tryInsertInstruction(Select); } /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A @@ -341,16 +346,20 @@ public: /// Return the best VPlan for \p VF. VPlan &getBestPlanFor(ElementCount VF) const; - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected \p VF, \p UF and VPlan \p BestPlan. + /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan + /// according to the best selected \p VF and \p UF. + /// /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue /// vectorization re-using plans for both the main and epilogue vector loops. /// It should be removed once the re-use issue has been fixed. /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop - /// to re-use expansion results generated during main plan execution. Returns - /// a mapping of SCEVs to their expanded IR values. 
Note that this is a - /// temporary workaround needed due to the current epilogue handling. - DenseMap<const SCEV *, Value *> + /// to re-use expansion results generated during main plan execution. + /// + /// Returns a mapping of SCEVs to their expanded IR values and a mapping for + /// the reduction resume values. Note that this is a temporary workaround + /// needed due to the current epilogue handling. + std::pair<DenseMap<const SCEV *, Value *>, + DenseMap<const RecurrenceDescriptor *, Value *>> executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8e135d80f4f2..51ce88480c08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -584,10 +584,6 @@ public: /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - // Returns the resume value (bc.merge.rdx) for a reduction as - // generated by fixReduction. - PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); - /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. In cases @@ -626,9 +622,6 @@ protected: BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State); - /// Handle all cross-iteration phis in the header. - void fixCrossIterationPHIs(VPTransformState &State); - /// Create the exit value of first order recurrences in the middle block and /// update their users. void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, @@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } } -PHINode *InnerLoopVectorizer::getReductionResumeValue( - const RecurrenceDescriptor &RdxDesc) { - auto It = ReductionResumeValues.find(&RdxDesc); - assert(It != ReductionResumeValues.end() && - "Expected to find a resume value for the reduction."); - return It->second; -} - namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. - // This is the second stage of vectorizing recurrences. - fixCrossIterationPHIs(State); + // This is the second stage of vectorizing recurrences. Note that fixing + // reduction phis are already modeled in VPlan. + // TODO: Also model fixing fixed-order recurrence phis in VPlan. + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); + for (VPRecipeBase &R : HeaderVPBB->phis()) { + if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) + fixFixedOrderRecurrence(FOR, State); + } // Forget the original basic block. 
PSE.getSE()->forgetLoop(OrigLoop); @@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted @@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #2: We now need to fix the recurrences by adding incoming edges to - // the currently empty PHI nodes. At this point every instruction in the - // original loop is widened to a vector form so we can use them to construct - // the incoming edges. - VPBasicBlock *Header = - State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); - - for (VPRecipeBase &R : Header->phis()) { - if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) - fixReduction(ReductionPhi, State); - } - - for (VPRecipeBase &R : Header->phis()) { - if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) - fixFixedOrderRecurrence(FOR, State); - } -} - void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An @@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setName("scalar.recur"); } -void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); - // Get it's reduction variable descriptor. - assert(Legal->isReductionVariable(OrigPhi) && - "Unable to find the reduction variable"); - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - - RecurKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue)) - State.setDebugLocFrom(I->getDebugLoc()); - - VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); - - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - - State.setDebugLocFrom(LoopExitInst->getDebugLoc()); - - Type *PhiTy = OrigPhi->getType(); - // If tail is folded by masking, the vector value to leave the loop should be - // a Select choosing between the vectorized LoopExitInst and vectorized Phi, - // instead of the former. For an inloop reduction the reduction will already - // be predicated, and does not need to be handled here. 
- if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { - VPValue *Def = nullptr; - for (VPUser *U : LoopExitInstDef->users()) { - auto *S = dyn_cast<VPInstruction>(U); - if (S && S->getOpcode() == Instruction::Select) { - Def = S; - break; - } - } - if (Def) - LoopExitInstDef = Def; - } - - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) - RdxParts[Part] = State.get(LoopExitInstDef, Part); - - // If the vector reduction can be performed in a smaller type, we truncate - // then extend the loop exit value to enable InstCombine to evaluate the - // entire expression in the smaller type. - if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { - Builder.SetInsertPoint(LoopMiddleBlock, - LoopMiddleBlock->getFirstInsertionPt()); - Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); - for (unsigned Part = 0; Part < UF; ++Part) { - RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); - } - } - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getOpcode(RK); - - // The middle block terminator has already been assigned a DebugLoc here (the - // OrigLoop's single latch terminator). We want the whole middle block to - // appear to execute on this line because: (a) it is all compiler generated, - // (b) these instructions are always executed after evaluating the latch - // conditional branch, and (c) other passes may add new predecessors which - // terminate on this line. This is the easiest way to ensure we don't - // accidentally cause an extra step back into the loop while debugging. - State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); - if (PhiR->isOrdered()) - ReducedPartRdx = RdxParts[UF - 1]; - else { - // Floating-point operations should have some FMF to enable the reduction. - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = RdxParts[Part]; - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - ReducedPartRdx = Builder.CreateBinOp( - (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, - ReducedPartRdx, RdxPart); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - } - } - - // Create the reduction after the loop. Note that inloop reductions create the - // target reduction in the loop using a Reduction recipe. - if (VF.isVector() && !PhiR->isInLoop()) { - ReducedPartRdx = - createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); - // If the reduction can be performed in a smaller type, we need to extend - // the reduction to the wider type before we branch to the original loop. - if (PhiTy != RdxDesc.getRecurrenceType()) - ReducedPartRdx = RdxDesc.isSigned() - ? Builder.CreateSExt(ReducedPartRdx, PhiTy) - : Builder.CreateZExt(ReducedPartRdx, PhiTy); - } - - PHINode *ResumePhi = - dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); - - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - - // If we are fixing reductions in the epilogue loop then we should already - // have created a bc.merge.rdx Phi after the main vector body. 
Ensure that - // we carry over the incoming values correctly. - for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); - else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(ReductionStartValue, Incoming); - } - - // Set the resume value for this reduction - ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); - - // If there were stores of the reduction value to a uniform memory address - // inside the loop, create the final store here. - if (StoreInst *SI = RdxDesc.IntermediateStore) { - StoreInst *NewSI = - Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), - SI->getAlign()); - propagateMetadata(NewSI, SI); - - // If the reduction value is used in other places, - // then let the code below create PHI's for that. - } - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - - // We know that the loop is in LCSSA form. We need to update the PHI nodes - // in the exit blocks. See comment on analogous loop in - // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF.isVector())) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { - LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If trip count is known or estimated compile time constant, limit the - // interleave count to be less than the trip count divided by VF, provided it - // is at least 1. - // - // For scalable vectors we can't know if interleaving is beneficial. It may - // not be beneficial for small loops if none of the lanes in the second vector - // iterations is enabled. However, for larger loops, there is likely to be a - // similar benefit as for fixed-width vectors. For now, we choose to leave - // the InterleaveCount as if vscale is '1', although if some information about - // the vector is known (e.g. min vector size), we can make a better decision. - if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); - // Make sure MaxInterleaveCount is greater than 0. 
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount); + unsigned EstimatedVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) + EstimatedVF *= *VScale; + } + assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); + + unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + if (KnownTC) { + // If trip count is known we select between two prospective ICs, where + // 1) the aggressive IC is capped by the trip count divided by VF + // 2) the conservative IC is capped by the trip count divided by (VF * 2) + // The final IC is selected in a way that the epilogue loop trip count is + // minimized while maximizing the IC itself, so that we either run the + // vector loop at least once if it generates a small epilogue loop, or else + // we run the vector loop at least twice. + + unsigned InterleaveCountUB = bit_floor( + std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); + unsigned InterleaveCountLB = bit_floor(std::max( + 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + MaxInterleaveCount = InterleaveCountLB; + + if (InterleaveCountUB != InterleaveCountLB) { + unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); + unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); + // If both produce same scalar tail, maximize the IC to do the same work + // in fewer vector loop iterations + if (TailTripCountUB == TailTripCountLB) + MaxInterleaveCount = InterleaveCountUB; + } + } else if (BestKnownTC) { + // If trip count is an estimated compile time constant, limit the + // IC to be capped by the trip count divided by VF * 2, such that the vector + // loop runs at least twice to make interleaving seem profitable when there + // is an epilogue loop present. Since exact Trip count is not known we + // choose to be conservative in our IC estimate. + MaxInterleaveCount = bit_floor(std::max( + 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); } assert(MaxInterleaveCount > 0 && @@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -SCEV2ValueTy LoopVectorizationPlanner::executePlan( +// Check if \p RedResult is a ComputeReductionResult instruction, and if it is +// create a merge phi node for it and add it to \p ReductionResumeValues. +static void createAndCollectMergePhiForReduction( + VPInstruction *RedResult, + DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, + VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { + if (!RedResult || + RedResult->getOpcode() != VPInstruction::ComputeReductionResult) + return; + + auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + + TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Value *FinalValue = + State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); + auto *ResumePhi = + dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + + // TODO: bc.merge.rdx should not be created here, instead it should be + // modeled in VPlan. + BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. 
+ auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(FinalValue, Incoming); + else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + // TODO: This fixup should instead be modeled in VPlan. + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = + OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); + OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + + ReductionResumeValues[&RdxDesc] = BCBlockPhi; +} + +std::pair<DenseMap<const SCEV *, Value *>, + DenseMap<const RecurrenceDescriptor *, Value *>> +LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { @@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); + // 2.5 Collect reduction resume values. + DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; + auto *ExitVPBB = + cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), + ReductionResumeValues, State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB]); + } + + // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). MDNode *OrigLoopID = OrigLoop->getLoopID(); @@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); - return State.ExpandedSCEVs; + return {State.ExpandedSCEVs, ReductionResumeValues}; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (ECEntryIt != EdgeMaskCache.end()) return ECEntryIt->second; - VPValue *SrcMask = createBlockInMask(Src, Plan); + VPValue *SrcMask = getBlockInMask(Src); // The terminator has to be a branch inst! BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); @@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { BlockMaskCache[Header] = BlockMask; } -VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - - // Look for cached value. - BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); - if (BCEntryIt != BlockMaskCache.end()) - return BCEntryIt->second; +VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { + // Return the cached value. 
+ BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); + assert(BCEntryIt != BlockMaskCache.end() && + "Trying to access mask for block without one."); + return BCEntryIt->second; +} +void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); assert(OrigLoop->getHeader() != BB && "Loop header must have cached block mask"); @@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); - if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. - return BlockMaskCache[BB] = EdgeMask; + if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. + BlockMaskCache[BB] = EdgeMask; + } if (!BlockMask) { // BlockMask has its initialized nullptr value. BlockMask = EdgeMask; @@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } - return BlockMaskCache[BB] = BlockMask; + BlockMaskCache[BB] = BlockMask; } VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, @@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), *Plan); + Mask = getBlockInMask(I->getParent()); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; if (Consecutive) { - auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), - Reverse, I->getDebugLoc()); + auto *GEP = dyn_cast<GetElementPtrInst>( + Ptr->getUnderlyingValue()->stripPointerCasts()); + auto *VectorPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false, + I->getDebugLoc()); Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } @@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // all-true mask. VPValue *Mask = nullptr; if (Legal->isMaskRequired(CI)) - Mask = createBlockInMask(CI->getParent(), *Plan); + Mask = getBlockInMask(CI->getParent()); else Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); @@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, // div/rem operation itself. Otherwise fall through to general handling below. if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); - VPValue *Mask = createBlockInMask(I->getParent(), *Plan); + VPValue *Mask = getBlockInMask(I->getParent()); VPValue *One = Plan->getVPValueOrAddLiveIn( ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = @@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, // added initially. Masked replicate recipes will later be placed under an // if-then construct to prevent side-effects. Generate recipes to compute // the block mask for this region. 
- BlockInMask = createBlockInMask(I->getParent(), Plan); + BlockInMask = getBlockInMask(I->getParent()); } auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), @@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - // Proactively create header mask. Masks for other blocks are created on - // demand. - RecipeBuilder.createHeaderMask(*Plan); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; + bool NeedsMasks = CM.foldTailByMasking() || + any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { + return Legal->blockNeedsPredication(BB); + }); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); + if (VPBB == HeaderVPBB) + RecipeBuilder.createHeaderMask(*Plan); + else if (NeedsMasks) + RecipeBuilder.createBlockInMask(BB, *Plan); + // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { @@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // to reductions, with one operand being vector and the other being the scalar // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. +// +// A ComputeReductionResult recipe is added to the middle block, also for +// in-loop reductions which compute their result in-loop, because generating +// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores // sank outside of the loop would keep the same order as they had in the // original loop. 
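The createBlockInMask/getBlockInMask split in the hunks above changes block masks from being created lazily on first query to being created eagerly, once per block, during the RPO walk (and only when NeedsMasks says predication can occur); recipe construction afterwards does pure lookups. Below is a minimal, self-contained sketch of that pattern, assuming ints for blocks and optional lane bitmasks in place of VPValues; MaskCache and its data layout are illustrative stand-ins, not the actual VPRecipeBuilder.

#include <cassert>
#include <cstdint>
#include <map>
#include <optional>
#include <utility>
#include <vector>

// nullopt plays the role of "no mask / all lanes active".
using Mask = std::optional<uint64_t>;

struct MaskCache {
  std::map<int, Mask> BlockMask;                // block -> cached entry mask
  std::map<std::pair<int, int>, Mask> EdgeMask; // (pred, succ) -> edge mask

  // Compute and cache the entry mask of block B by OR-ing all incoming edge
  // masks. Runs once per block, while blocks are visited in RPO, so every
  // predecessor's masks are already available.
  void createBlockInMask(int B, const std::vector<int> &Preds) {
    assert(!BlockMask.count(B) && "mask for block already computed");
    Mask Result;
    bool First = true;
    for (int P : Preds) {
      Mask E = EdgeMask.at({P, B});
      if (!E) { // all-one predecessor edge, so the block mask is all-one too
        BlockMask[B] = std::nullopt;
        return;
      }
      Result = First ? E : Mask(*Result | *E);
      First = false;
    }
    BlockMask[B] = Result;
  }

  // Pure query used while building recipes; the mask must already exist.
  Mask getBlockInMask(int B) const {
    auto It = BlockMask.find(B);
    assert(It != BlockMask.end() && "mask for block not computed yet");
    return It->second;
  }
};

The assert in getBlockInMask captures the point of the change: a missing mask is now a construction-order bug instead of a silent trigger for recursive mask creation.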
@@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase *R : ReductionPHIList) R->moveBefore(*Header, Header->getFirstNonPhi()); - SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis; for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; - InLoopReductionPhis.push_back(PhiR); - } - for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && @@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (CM.blockNeedsPredicationForAnyReason(BB)) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(CurrentLink); - CondOp = RecipeBuilder.createBlockInMask(BB, *Plan); + CondOp = RecipeBuilder.getBlockInMask(BB); } VPReductionRecipe *RedRecipe = new VPReductionRecipe( @@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); - if (!PhiR || PhiR->isInLoop()) + if (!PhiR) continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { - VPValue *Cond = - RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); - VPValue *Red = PhiR->getBackedgeValue(); - assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && + auto *OrigExitingVPV = PhiR->getBackedgeValue(); + auto *NewExitingVPV = PhiR->getBackedgeValue(); + if (!PhiR->isInLoop() && CM.foldTailByMasking()) { + VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); + assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); - FastMathFlags FMFs = RdxDesc.getFastMathFlags(); Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); - Result = + std::optional<FastMathFlags> FMFs = PhiTy->isFloatingPointTy() - ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) - : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); - Result->insertBefore(&*Builder.getInsertPoint()); - Red->replaceUsesWithIf( - Result->getVPSingleValue(), - [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); }); + ? std::make_optional(RdxDesc.getFastMathFlags()) + : std::nullopt; + NewExitingVPV = + Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); + OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { + return isa<VPInstruction>(&U) && + cast<VPInstruction>(&U)->getOpcode() == + VPInstruction::ComputeReductionResult; + }); if (PreferPredicatedReductionSelect || TTI.preferPredicatedReductionSelect( PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) - PhiR->setOperand(1, Result->getVPSingleValue()); + PhiR->setOperand(1, NewExitingVPV); } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
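For tail-folded reductions the hunk above now builds the guarding select through the VPBuilder (Builder.createSelect, with fast-math flags only for floating-point phis) and retargets only the ComputeReductionResult user at it. The following scalar model shows why that select sits between the reduction phi and its exiting value; it assumes a plain integer sum with a fixed VF and is a sketch of the data flow, not LLVM API.

#include <array>
#include <cstdint>

constexpr unsigned VF = 4; // lanes per vector iteration (fixed for the sketch)

// One vector iteration of a tail-folded sum reduction. RedPhi is the value of
// the reduction phi entering the iteration, Data the loaded lane values and
// HeaderMask the active-lane mask for this (possibly partial) iteration.
std::array<int64_t, VF> foldedStep(std::array<int64_t, VF> RedPhi,
                                   const std::array<int64_t, VF> &Data,
                                   const std::array<bool, VF> &HeaderMask) {
  std::array<int64_t, VF> Next{};
  for (unsigned L = 0; L < VF; ++L) {
    int64_t Updated = RedPhi[L] + Data[L];         // the widened reduction op
    Next[L] = HeaderMask[L] ? Updated : RedPhi[L]; // the inserted select
  }
  return Next; // feeds both the phi's backedge and ComputeReductionResult
}

Lanes masked off in the final, partial vector iteration keep the phi's previous value, so the later horizontal reduction still sees exactly one contribution per scalar element.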
@@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); - auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, - Result->getVPSingleValue(), RdxTy); + auto *Trunc = + new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); auto *Extnd = RdxDesc.isSigned() ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); - Trunc->insertAfter(Result); + Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); Extnd->insertAfter(Trunc); - Result->getVPSingleValue()->replaceAllUsesWith(Extnd); - Trunc->setOperand(0, Result->getVPSingleValue()); + if (PhiR->getOperand(1) == NewExitingVPV) + PhiR->setOperand(1, Extnd->getVPSingleValue()); + NewExitingVPV = Extnd; } + + // We want code in the middle block to appear to execute on the location of + // the scalar loop's latch terminator because: (a) it is all compiler + // generated, (b) these instructions are always executed after evaluating + // the latch conditional branch, and (c) other passes may add new + // predecessors which terminate on this line. This is the easiest way to + // ensure we don't accidentally cause an extra step back into the loop while + // debugging. + DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); + + // TODO: At the moment ComputeReductionResult also drives creation of the + // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here + // even for in-loop reductions, until the reduction resume value handling is + // also modeled in VPlan. + auto *FinalReductionResult = new VPInstruction( + VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); + cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) + ->appendRecipe(FinalReductionResult); + OrigExitingVPV->replaceUsesWithIf( + FinalReductionResult, + [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); } VPlanTransforms::clearReductionWrapFlags(*Plan); @@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { EPI, &LVL, &CM, BFI, PSI, Checks); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - BestMainPlan, MainILV, DT, true); + const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( + EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. 
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { - ResumeV = MainILV.getReductionResumeValue( - ReductionPhi->getRecurrenceDescriptor()); + ResumeV = ReductionResumeValues + .find(&ReductionPhi->getRecurrenceDescriptor()) + ->second; } else { // Create induction resume values for both widened pointer and // integer/fp inductions and update the start value of the induction diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 304991526064..8e22b54f002d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10596,7 +10596,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) { + auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF, + unsigned I, unsigned SliceSize) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa<UndefValue>(V) && !isa<PoisonValue>(V); })) @@ -10619,11 +10620,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Idx == 0) || (Mask.size() == InputVF && ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { - std::iota(Mask.begin(), Mask.end(), 0); + std::iota(std::next(Mask.begin(), I * SliceSize), + std::next(Mask.begin(), (I + 1) * SliceSize), 0); } else { - unsigned I = + unsigned IVal = *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); - std::fill(Mask.begin(), Mask.end(), I); + std::fill(std::next(Mask.begin(), I * SliceSize), + std::next(Mask.begin(), (I + 1) * SliceSize), IVal); } return true; }; @@ -10872,7 +10875,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } else if (Vec1) { IsUsedInExpr &= FindReusedSplat( ExtractMask, - cast<FixedVectorType>(Vec1->getType())->getNumElements()); + cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0, + ExtractMask.size()); ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { @@ -10898,7 +10902,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); if (TEs.size() == 1) { IsUsedInExpr &= - FindReusedSplat(VecMask, TEs.front()->getVectorFactor()); + FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize); ShuffleBuilder.add(*TEs.front(), VecMask); if (TEs.front()->VectorizedValue) IsNonPoisoned &= @@ -11139,6 +11143,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); + if (const TreeEntry *TE = getTreeEntry(V)) + V = TE->VectorizedValue; setInsertPointAfterBundle(E); V = FinalShuffle(V, E, VecTy, IsSigned); E->VectorizedValue = V; @@ -11903,8 +11909,10 @@ Value *BoUpSLP::vectorizeTree( if (!Ex) { // "Reuse" the existing extract to improve final codegen. 
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { - Ex = Builder.CreateExtractElement(ES->getOperand(0), - ES->getOperand(1)); + Value *V = ES->getVectorOperand(); + if (const TreeEntry *ETE = getTreeEntry(V)) + V = ETE->VectorizedValue; + Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); } else { Ex = Builder.CreateExtractElement(Vec, Lane); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 7ff6749a0908..4b3143aead46 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -138,8 +138,11 @@ public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True or the loop mask when - /// tail folding. It returns the *entry* mask for the block BB. - VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); + /// tail folding. + void createBlockInMask(BasicBlock *BB, VPlan &Plan); + + /// Returns the *entry* mask for the block \p BB. + VPValue *getBlockInMask(BasicBlock *BB) const; /// A helper function that computes the predicate of the edge between SRC /// and DST. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d7df9c9575a..b6e56c47c227 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) { // ExitBB can be re-used for the exit block of the Plan. NewBB = State->CFG.ExitBB; State->CFG.PrevBB = NewBB; + State->Builder.SetInsertPoint(NewBB->getFirstNonPHI()); // Update the branch instruction in the predecessor to branch to ExitBB. VPBlockBase *PredVPB = getSingleHierarchicalPredecessor(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 7d33baac52c9..4b4f4911eb64 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -842,6 +842,12 @@ public: WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {} }; +protected: + struct GEPFlagsTy { + char IsInBounds : 1; + GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {} + }; + private: struct DisjointFlagsTy { char IsDisjoint : 1; @@ -849,9 +855,6 @@ private: struct ExactFlagsTy { char IsExact : 1; }; - struct GEPFlagsTy { - char IsInBounds : 1; - }; struct NonNegFlagsTy { char NonNeg : 1; }; @@ -933,12 +936,21 @@ public: : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp), FMFs(FMFs) {} +protected: + template <typename IterT> + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + GEPFlagsTy GEPFlags, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), OpType(OperationType::GEPOp), + GEPFlags(GEPFlags) {} + +public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPInstructionSC || R->getVPDefID() == VPRecipeBase::VPWidenSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || R->getVPDefID() == VPRecipeBase::VPWidenCastSC || - R->getVPDefID() == VPRecipeBase::VPReplicateSC; + R->getVPDefID() == VPRecipeBase::VPReplicateSC || + R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } /// Drop all poison-generating flags. 
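Moving GEPFlagsTy into a protected section and adding the protected VPRecipeWithIRFlags constructor above lets a recipe record the inbounds property when it is built, instead of re-deriving it from the underlying GEP at execution time. A compressed sketch of that flag-carrying pattern follows; the class names (RecipeWithFlags, VectorPointerRecipe) are invented for illustration and only mirror the ownership of the flag, not the real VPlan classes.

#include <cstdio>

class RecipeWithFlags {
protected:
  // Stand-in for GEPFlagsTy: the only GEP-related flag carried here.
  struct GEPFlags { bool IsInBounds; };

  explicit RecipeWithFlags(GEPFlags F) : Flags(F) {}

public:
  bool isInBounds() const { return Flags.IsInBounds; }

private:
  GEPFlags Flags;
};

class VectorPointerRecipe : public RecipeWithFlags {
public:
  VectorPointerRecipe(bool IsInBounds, bool Reverse)
      : RecipeWithFlags(GEPFlags{IsInBounds}), IsReverse(Reverse) {}

  // Codegen consults the stored flag; the original GEP is never re-inspected.
  void execute() const {
    std::printf("emit %s gep, %s order\n",
                isInBounds() ? "inbounds" : "plain",
                IsReverse ? "reverse" : "forward");
  }

private:
  bool IsReverse;
};

int main() {
  VectorPointerRecipe R(/*IsInBounds=*/true, /*Reverse=*/false);
  R.execute();
  return 0;
}

Capturing the flag up front keeps code generation independent of whatever later transforms do to the original pointer operand, which is the same motivation as the isInBounds() change in VPVectorPointerRecipe::execute further down.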
@@ -1061,7 +1073,8 @@ public: // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, BranchOnCount, - BranchOnCond + BranchOnCond, + ComputeReductionResult, }; private: @@ -1360,15 +1373,16 @@ public: /// A recipe to compute the pointers for widened memory accesses of IndexTy for /// all parts. If IsReverse is true, compute pointers for accessing the input in /// reverse order per part. -class VPVectorPointerRecipe : public VPRecipeBase, public VPValue { +class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue { Type *IndexedTy; bool IsReverse; public: VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse, - DebugLoc DL) - : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this), - IndexedTy(IndexedTy), IsReverse(IsReverse) {} + bool IsInBounds, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr), + GEPFlagsTy(IsInBounds), DL), + VPValue(this), IndexedTy(IndexedTy), IsReverse(IsReverse) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -3132,6 +3146,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { return Rep->isUniform(); if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def)) return all_of(GEP->operands(), isUniformAfterVectorization); + if (auto *VPI = dyn_cast<VPInstruction>(Def)) + return VPI->getOpcode() == VPInstruction::ComputeReductionResult; return false; } } // end namespace vputils diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 76961629aece..1f844bce2310 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> @@ -119,7 +120,9 @@ bool VPRecipeBase::mayHaveSideEffects() const { return false; case VPInstructionSC: switch (cast<VPInstruction>(this)->getOpcode()) { + case Instruction::Or: case Instruction::ICmp: + case Instruction::Select: case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: @@ -401,6 +404,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); return CondBr; } + case VPInstruction::ComputeReductionResult: { + if (Part != 0) + return State.get(this, 0); + + // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary + // and will be removed by breaking up the recipe further. + auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); + auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); + // Get its reduction variable descriptor. + const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + + RecurKind RK = RdxDesc.getRecurrenceKind(); + + State.setDebugLocFrom(getDebugLoc()); + + VPValue *LoopExitingDef = getOperand(1); + Type *PhiTy = OrigPhi->getType(); + VectorParts RdxParts(State.UF); + for (unsigned Part = 0; Part < State.UF; ++Part) + RdxParts[Part] = State.get(LoopExitingDef, Part); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. 
+ // TODO: Handle this in truncateToMinBW. + if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); + } + // Reduce all of the unrolled parts into a single vector. + Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = RecurrenceDescriptor::getOpcode(RK); + + if (PhiR->isOrdered()) { + ReducedPartRdx = RdxParts[State.UF - 1]; + } else { + // Floating-point operations should have some FMF to enable the reduction. + IRBuilderBase::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + for (unsigned Part = 1; Part < State.UF; ++Part) { + Value *RdxPart = RdxParts[Part]; + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + ReducedPartRdx = Builder.CreateBinOp( + (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + TrackingVH<Value> ReductionStartValue = + RdxDesc.getRecurrenceStartValue(); + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + } else + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + } + } + + // Create the reduction after the loop. Note that inloop reductions create + // the target reduction in the loop using a Reduction recipe. + if (State.VF.isVector() && !PhiR->isInLoop()) { + ReducedPartRdx = + createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (PhiTy != RdxDesc.getRecurrenceType()) + ReducedPartRdx = RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, PhiTy) + : Builder.CreateZExt(ReducedPartRdx, PhiTy); + } + + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + auto *NewSI = Builder.CreateAlignedStore( + ReducedPartRdx, SI->getPointerOperand(), SI->getAlign()); + propagateMetadata(NewSI, SI); + } + + return ReducedPartRdx; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -477,6 +558,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::ComputeReductionResult: + O << "compute-reduction-result"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -1225,9 +1309,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) { ? DL.getIndexType(IndexedTy->getPointerTo()) : Builder.getInt32Ty(); Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); - bool InBounds = false; - if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = GEP->isInBounds(); + bool InBounds = isInBounds(); if (IsReverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. 
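The ComputeReductionResult lowering added above first folds the UF unrolled parts into a single vector using the reduction opcode and then reduces that vector horizontally (via createTargetReduction) before any extension and the optional intermediate store. Here is a minimal scalar model of those two steps for a plain integer add reduction with fixed VF and UF; it sketches the data flow only and is not the recipe's code.

#include <array>
#include <cstdint>

constexpr unsigned VF = 4; // lanes per vector part
constexpr unsigned UF = 2; // unroll factor, i.e. number of parts

using Vec = std::array<int64_t, VF>;

int64_t computeReductionResult(const std::array<Vec, UF> &RdxParts) {
  // Step 1: fold all unrolled parts into one vector ("bin.rdx" above).
  Vec Reduced = RdxParts[0];
  for (unsigned Part = 1; Part < UF; ++Part)
    for (unsigned L = 0; L < VF; ++L)
      Reduced[L] += RdxParts[Part][L];

  // Step 2: horizontal reduction of the remaining vector; this is the scalar
  // that later feeds the bc.merge.rdx phi in the scalar preheader.
  int64_t Rdx = 0;
  for (unsigned L = 0; L < VF; ++L)
    Rdx += Reduced[L];
  return Rdx;
}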
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 33132880d5a4..5c430620a2dc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -829,15 +829,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Type *ATy = TypeInfo.inferScalarType(A); if (TruncTy == ATy) { Trunc->replaceAllUsesWith(A); - } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - auto *VPC = - new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { - auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); + } else { + // Don't replace a scalarizing recipe with a widened cast. + if (isa<VPReplicateRecipe>(&R)) + break; + if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } } #ifndef NDEBUG // Verify that the cached type info is for both A and its users is still |