Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
 contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 12
 contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 6
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 7
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 62
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 68
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 23
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 7
 contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 135
 contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 50
 contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp | 8
 contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 10
 contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 51
 contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 91
 contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 51
 contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp | 6
 contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 6
 contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 2
 contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp | 46
 contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp | 10
 contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 35
 contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 20
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 27
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 481
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 24
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 7
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp | 1
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h | 34
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 88
 contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 23
 30 files changed, 794 insertions(+), 599 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 529f7309a1a2..89a1ad2243c8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -2953,6 +2953,9 @@ void coro::salvageDebugInfo(
std::optional<BasicBlock::iterator> InsertPt;
if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
+ // Update DILocation only in O0 since it is easy to get out of sync in
+ // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
+ // an example.
if (!OptimizeFrame && I->getDebugLoc())
DVI.setDebugLoc(I->getDebugLoc());
} else if (isa<Argument>(Storage))
@@ -2988,9 +2991,14 @@ void coro::salvageDebugInfo(
// dbg.declare does.
if (DPV.getType() == DPValue::LocationType::Declare) {
std::optional<BasicBlock::iterator> InsertPt;
- if (auto *I = dyn_cast<Instruction>(Storage))
+ if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- else if (isa<Argument>(Storage))
+ // Update DILocation only in O0 since it is easy to get out of sync in
+ // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
+ // an example.
+ if (!OptimizeFrame && I->getDebugLoc())
+ DPV.setDebugLoc(I->getDebugLoc());
+ } else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
if (InsertPt) {
DPV.removeFromParent();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b2618e35b085..cc5a4ee8c2bd 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -6725,10 +6725,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
LLVMContext &Ctx = AI.CB->getContext();
ObjectSizeOpts Opts;
ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
- SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
+ SizeOffsetValue SizeOffsetPair = Eval.compute(AI.CB);
assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() &&
- cast<ConstantInt>(SizeOffsetPair.second)->isZero());
- Size = SizeOffsetPair.first;
+ cast<ConstantInt>(SizeOffsetPair.Offset)->isZero());
+ Size = SizeOffsetPair.Size;
}
Instruction *IP =
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 556fde37efeb..96b612254ca5 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1666,13 +1666,6 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *Ashr = foldAddToAshr(I))
return Ashr;
- // min(A, B) + max(A, B) => A + B.
- if (match(&I, m_CombineOr(m_c_Add(m_SMax(m_Value(A), m_Value(B)),
- m_c_SMin(m_Deferred(A), m_Deferred(B))),
- m_c_Add(m_UMax(m_Value(A), m_Value(B)),
- m_c_UMin(m_Deferred(A), m_Deferred(B))))))
- return BinaryOperator::CreateWithCopiedFlags(Instruction::Add, A, B, &I);
-
// (~X) + (~Y) --> -2 - (X + Y)
{
// To ensure we can save instructions we need to ensure that we consume both
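Note: the special case removed above is not lost: together with the matching
removal in InstCombineMulDivRem.cpp below, it is subsumed by the generic
matchSymmetricPair() hook added to SimplifyAssociativeOrCommutative() in
InstructionCombining.cpp. A minimal C++ sketch of the underlying identity
(function name is illustrative):

    #include <algorithm>
    #include <cstdint>

    // min(a, b) + max(a, b) == a + b: whichever way the comparison goes,
    // the pair {min, max} is exactly the pair {a, b}.
    uint64_t sumViaMinMax(uint64_t a, uint64_t b) {
      return std::min(a, b) + std::max(a, b); // folds to a + b
    }
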
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 43d4496571be..40b48699f758 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -172,10 +172,10 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
// If the memcpy has metadata describing the members, see if we can get the
// TBAA tag describing our copy.
- MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
- CopyMD = M;
- } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ AAMDNodes AACopyMD = MI->getAAMetadata();
+
+ if (MDNode *M = AACopyMD.TBAAStruct) {
+ AACopyMD.TBAAStruct = nullptr;
if (M->getNumOperands() == 3 && M->getOperand(0) &&
mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
@@ -184,7 +184,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
Size &&
M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- CopyMD = cast<MDNode>(M->getOperand(2));
+ AACopyMD.TBAA = cast<MDNode>(M->getOperand(2));
}
Value *Src = MI->getArgOperand(1);
@@ -192,8 +192,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
LoadInst *L = Builder.CreateLoad(IntType, Src);
// Alignment from the mem intrinsic will be better, so use it.
L->setAlignment(*CopySrcAlign);
- if (CopyMD)
- L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ L->setAAMetadata(AACopyMD);
MDNode *LoopMemParallelMD =
MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
if (LoopMemParallelMD)
@@ -205,8 +204,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
StoreInst *S = Builder.CreateStore(L, Dest);
// Alignment from the mem intrinsic will be better, so use it.
S->setAlignment(*CopyDstAlign);
- if (CopyMD)
- S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ S->setAAMetadata(AACopyMD);
if (LoopMemParallelMD)
S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
if (AccessGroupMD)
@@ -1536,11 +1534,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
if (II->isCommutative()) {
- if (Instruction *I = foldCommutativeIntrinsicOverSelects(*II))
- return I;
-
- if (Instruction *I = foldCommutativeIntrinsicOverPhis(*II))
- return I;
+ if (auto Pair = matchSymmetricPair(II->getOperand(0), II->getOperand(1))) {
+ replaceOperand(*II, 0, Pair->first);
+ replaceOperand(*II, 1, Pair->second);
+ return II;
+ }
if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI))
return NewCall;
@@ -4246,39 +4244,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
Call.setCalledFunction(FTy, NestF);
return &Call;
}
-
-// op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y)
-Instruction *
-InstCombinerImpl::foldCommutativeIntrinsicOverSelects(IntrinsicInst &II) {
- assert(II.isCommutative());
-
- Value *A, *B, *C;
- if (match(II.getOperand(0), m_Select(m_Value(A), m_Value(B), m_Value(C))) &&
- match(II.getOperand(1),
- m_Select(m_Specific(A), m_Specific(C), m_Specific(B)))) {
- replaceOperand(II, 0, B);
- replaceOperand(II, 1, C);
- return &II;
- }
-
- return nullptr;
-}
-
-Instruction *
-InstCombinerImpl::foldCommutativeIntrinsicOverPhis(IntrinsicInst &II) {
- assert(II.isCommutative() && "Instruction should be commutative");
-
- PHINode *LHS = dyn_cast<PHINode>(II.getOperand(0));
- PHINode *RHS = dyn_cast<PHINode>(II.getOperand(1));
-
- if (!LHS || !RHS)
- return nullptr;
-
- if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) {
- replaceOperand(II, 0, P->first);
- replaceOperand(II, 1, P->second);
- return &II;
- }
-
- return nullptr;
-}
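Note: the two removed helpers are merged into the single matchSymmetricPair()
entry point (declared in InstCombineInternal.h and defined in
InstructionCombining.cpp below), which also learns a min/max form. A hedged
C++ illustration of the select case, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    // Both selects draw from {x, y} with mirrored arms, so a commutative
    // callee sees the unordered pair {x, y} regardless of c.
    uint32_t before(bool c, uint32_t x, uint32_t y) {
      uint32_t lhs = c ? x : y;
      uint32_t rhs = c ? y : x;
      return std::min(lhs, rhs);
    }

    // After the fold, the selects (and c itself) drop out entirely.
    uint32_t after(bool, uint32_t x, uint32_t y) { return std::min(x, y); }
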
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3875e59c3ede..7c1aff445524 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4920,8 +4920,9 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
}
}
- if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
- BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
+ if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() &&
+ (BO0->hasOneUse() || BO1->hasOneUse()) &&
+ BO0->getOperand(1) == BO1->getOperand(1)) {
switch (BO0->getOpcode()) {
default:
break;
@@ -5047,8 +5048,16 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
Value *Y = MinMax->getRHS();
if (ICmpInst::isSigned(Pred) && !MinMax->isSigned())
return nullptr;
- if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned())
- return nullptr;
+ if (ICmpInst::isUnsigned(Pred) && MinMax->isSigned()) {
+ // Revert the signed pred -> unsigned pred transform.
+ // TODO: We can flip the signedness of the predicate if both operands of
+ // the icmp are negative.
+ if (isKnownNonNegative(Z, SQ.getWithInstruction(&I)) &&
+ isKnownNonNegative(MinMax, SQ.getWithInstruction(&I))) {
+ Pred = ICmpInst::getFlippedSignednessPredicate(Pred);
+ } else
+ return nullptr;
+ }
SimplifyQuery Q = SQ.getWithInstruction(&I);
auto IsCondKnownTrue = [](Value *Val) -> std::optional<bool> {
if (!Val)
@@ -6860,6 +6869,57 @@ Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred,
return foldICmpAddOpConst(X, *C, Pred);
}
+ // abs(X) >= X --> true
+ // abs(X) u<= X --> true
+ // abs(X) < X --> false
+ // abs(X) u> X --> false
+ // abs(X) u>= X --> IsIntMinPoison ? `X > -1` : `X u<= INTMIN`
+ // abs(X) <= X --> IsIntMinPoison ? `X > -1` : `X u<= INTMIN`
+ // abs(X) == X --> IsIntMinPoison ? `X > -1` : `X u<= INTMIN`
+ // abs(X) u< X --> IsIntMinPoison ? `X < 0` : `X > INTMIN`
+ // abs(X) > X --> IsIntMinPoison ? `X < 0` : `X > INTMIN`
+ // abs(X) != X --> IsIntMinPoison ? `X < 0` : `X > INTMIN`
+ {
+ Value *X;
+ Constant *C;
+ if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X), m_Constant(C))) &&
+ match(Op1, m_Specific(X))) {
+ Value *NullValue = Constant::getNullValue(X->getType());
+ Value *AllOnesValue = Constant::getAllOnesValue(X->getType());
+ const APInt SMin =
+ APInt::getSignedMinValue(X->getType()->getScalarSizeInBits());
+ bool IsIntMinPoison = C->isAllOnesValue();
+ switch (Pred) {
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SGE:
+ return replaceInstUsesWith(CxtI, ConstantInt::getTrue(CxtI.getType()));
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_SLT:
+ return replaceInstUsesWith(CxtI, ConstantInt::getFalse(CxtI.getType()));
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_EQ: {
+ return replaceInstUsesWith(
+ CxtI, IsIntMinPoison
+ ? Builder.CreateICmpSGT(X, AllOnesValue)
+ : Builder.CreateICmpULT(
+ X, ConstantInt::get(X->getType(), SMin + 1)));
+ }
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_NE: {
+ return replaceInstUsesWith(
+ CxtI, IsIntMinPoison
+ ? Builder.CreateICmpSLT(X, NullValue)
+ : Builder.CreateICmpUGT(
+ X, ConstantInt::get(X->getType(), SMin)));
+ }
+ default:
+ llvm_unreachable("Invalid predicate!");
+ }
+ }
+ }
+
return nullptr;
}
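Note: a hedged source-level reading of the new abs(X) vs. X folds. std::abs
is used for illustration; in C++ std::abs(INT_MIN) is undefined, which
corresponds to the IsIntMinPoison (second argument all-ones) form of the
llvm.abs intrinsic:

    #include <cstdlib>

    bool geAbs(int x) {
      return std::abs(x) >= x; // always true; folds to the constant true
    }

    bool eqAbs(int x) {
      return std::abs(x) == x; // folds to x > -1 when INT_MIN is poison
    }

    bool neAbs(int x) {
      return std::abs(x) != x; // folds to x < 0 when INT_MIN is poison
    }
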
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index bdaf7550b4b4..21c61bd99018 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -276,17 +276,15 @@ private:
bool transformConstExprCastCall(CallBase &Call);
Instruction *transformCallThroughTrampoline(CallBase &Call,
IntrinsicInst &Tramp);
- Instruction *foldCommutativeIntrinsicOverSelects(IntrinsicInst &II);
- // Match a pair of Phi Nodes like
- // phi [a, BB0], [b, BB1] & phi [b, BB0], [a, BB1]
- // Return the matched two operands.
- std::optional<std::pair<Value *, Value *>>
- matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS);
-
- // Tries to fold (op phi(a, b) phi(b, a)) -> (op a, b)
- // while op is a commutative intrinsic call.
- Instruction *foldCommutativeIntrinsicOverPhis(IntrinsicInst &II);
+ // Return (a, b) if (LHS, RHS) is known to be (a, b) or (b, a).
+ // Otherwise, return std::nullopt
+ // Currently it matches:
+ // - LHS = (select c, a, b), RHS = (select c, b, a)
+ // - LHS = (phi [a, BB0], [b, BB1]), RHS = (phi [b, BB0], [a, BB1])
+ // - LHS = min(a, b), RHS = max(a, b)
+ std::optional<std::pair<Value *, Value *>> matchSymmetricPair(Value *LHS,
+ Value *RHS);
Value *simplifyMaskedLoad(IntrinsicInst &II);
Instruction *simplifyMaskedStore(IntrinsicInst &II);
@@ -502,11 +500,6 @@ public:
/// X % (C0 * C1)
Value *SimplifyAddWithRemainder(BinaryOperator &I);
- // Tries to fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b)
- // while Binop is commutative.
- Value *SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, Value *LHS,
- Value *RHS);
-
// Binary Op helper for select operations where the expression can be
// efficiently reorganized.
Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index f0ea3d9fcad5..e7f983a00e30 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -487,13 +487,6 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
return Res;
- // min(X, Y) * max(X, Y) => X * Y.
- if (match(&I, m_CombineOr(m_c_Mul(m_SMax(m_Value(X), m_Value(Y)),
- m_c_SMin(m_Deferred(X), m_Deferred(Y))),
- m_c_Mul(m_UMax(m_Value(X), m_Value(Y)),
- m_c_UMin(m_Deferred(X), m_Deferred(Y))))))
- return BinaryOperator::CreateWithCopiedFlags(Instruction::Mul, X, Y, &I);
-
// (mul Op0 Op1):
// if Log2(Op0) folds away ->
// (shl Op1, Log2(Op0))
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 351fc3b0174f..7f2018b3a199 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -411,6 +411,14 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
getComplexity(I.getOperand(1)))
Changed = !I.swapOperands();
+ if (I.isCommutative()) {
+ if (auto Pair = matchSymmetricPair(I.getOperand(0), I.getOperand(1))) {
+ replaceOperand(I, 0, Pair->first);
+ replaceOperand(I, 1, Pair->second);
+ Changed = true;
+ }
+ }
+
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
@@ -1096,8 +1104,8 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}
-std::optional<std::pair<Value *, Value *>>
-InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
+static std::optional<std::pair<Value *, Value *>>
+matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
if (LHS->getParent() != RHS->getParent())
return std::nullopt;
@@ -1123,25 +1131,41 @@ InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
return std::optional(std::pair(L0, R0));
}
-Value *InstCombinerImpl::SimplifyPhiCommutativeBinaryOp(BinaryOperator &I,
- Value *Op0,
- Value *Op1) {
- assert(I.isCommutative() && "Instruction should be commutative");
-
- PHINode *LHS = dyn_cast<PHINode>(Op0);
- PHINode *RHS = dyn_cast<PHINode>(Op1);
-
- if (!LHS || !RHS)
- return nullptr;
-
- if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) {
- Value *BI = Builder.CreateBinOp(I.getOpcode(), P->first, P->second);
- if (auto *BO = dyn_cast<BinaryOperator>(BI))
- BO->copyIRFlags(&I);
- return BI;
+std::optional<std::pair<Value *, Value *>>
+InstCombinerImpl::matchSymmetricPair(Value *LHS, Value *RHS) {
+ Instruction *LHSInst = dyn_cast<Instruction>(LHS);
+ Instruction *RHSInst = dyn_cast<Instruction>(RHS);
+ if (!LHSInst || !RHSInst || LHSInst->getOpcode() != RHSInst->getOpcode())
+ return std::nullopt;
+ switch (LHSInst->getOpcode()) {
+ case Instruction::PHI:
+ return matchSymmetricPhiNodesPair(cast<PHINode>(LHS), cast<PHINode>(RHS));
+ case Instruction::Select: {
+ Value *Cond = LHSInst->getOperand(0);
+ Value *TrueVal = LHSInst->getOperand(1);
+ Value *FalseVal = LHSInst->getOperand(2);
+ if (Cond == RHSInst->getOperand(0) && TrueVal == RHSInst->getOperand(2) &&
+ FalseVal == RHSInst->getOperand(1))
+ return std::pair(TrueVal, FalseVal);
+ return std::nullopt;
+ }
+ case Instruction::Call: {
+ // Match min(a, b) and max(a, b)
+ MinMaxIntrinsic *LHSMinMax = dyn_cast<MinMaxIntrinsic>(LHSInst);
+ MinMaxIntrinsic *RHSMinMax = dyn_cast<MinMaxIntrinsic>(RHSInst);
+ if (LHSMinMax && RHSMinMax &&
+ LHSMinMax->getPredicate() ==
+ ICmpInst::getSwappedPredicate(RHSMinMax->getPredicate()) &&
+ ((LHSMinMax->getLHS() == RHSMinMax->getLHS() &&
+ LHSMinMax->getRHS() == RHSMinMax->getRHS()) ||
+ (LHSMinMax->getLHS() == RHSMinMax->getRHS() &&
+ LHSMinMax->getRHS() == RHSMinMax->getLHS())))
+ return std::pair(LHSMinMax->getLHS(), LHSMinMax->getRHS());
+ return std::nullopt;
+ }
+ default:
+ return std::nullopt;
}
-
- return nullptr;
}
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
@@ -1187,14 +1211,6 @@ Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
};
if (LHSIsSelect && RHSIsSelect && A == D) {
- // op(select(%v, %x, %y), select(%v, %y, %x)) --> op(%x, %y)
- if (I.isCommutative() && B == F && C == E) {
- Value *BI = Builder.CreateBinOp(I.getOpcode(), B, E);
- if (auto *BO = dyn_cast<BinaryOperator>(BI))
- BO->copyIRFlags(&I);
- return BI;
- }
-
// (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
Cond = A;
True = simplifyBinOp(Opcode, B, E, FMF, Q);
@@ -1577,11 +1593,6 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
BO.getParent() != Phi1->getParent())
return nullptr;
- if (BO.isCommutative()) {
- if (Value *V = SimplifyPhiCommutativeBinaryOp(BO, Phi0, Phi1))
- return replaceInstUsesWith(BO, V);
- }
-
// Fold if there is at least one specific constant value in phi0 or phi1's
// incoming values that comes from the same block and this specific constant
// value can be used to do optimization for specific binary operator.
@@ -3197,6 +3208,64 @@ Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) {
return replaceOperand(SI, 0, Op0);
}
+ ConstantInt *SubLHS;
+ if (match(Cond, m_Sub(m_ConstantInt(SubLHS), m_Value(Op0)))) {
+ // Change 'switch (1-X) case 1:' into 'switch (X) case 0:'.
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(SubLHS, Case.getCaseValue());
+ assert(isa<ConstantInt>(NewCase) &&
+ "Result of expression should be constant");
+ Case.setValue(cast<ConstantInt>(NewCase));
+ }
+ return replaceOperand(SI, 0, Op0);
+ }
+
+ uint64_t ShiftAmt;
+ if (match(Cond, m_Shl(m_Value(Op0), m_ConstantInt(ShiftAmt))) &&
+ ShiftAmt < Op0->getType()->getScalarSizeInBits() &&
+ all_of(SI.cases(), [&](const auto &Case) {
+ return Case.getCaseValue()->getValue().countr_zero() >= ShiftAmt;
+ })) {
+ // Change 'switch (X << 2) case 4:' into 'switch (X) case 1:'.
+ OverflowingBinaryOperator *Shl = cast<OverflowingBinaryOperator>(Cond);
+ if (Shl->hasNoUnsignedWrap() || Shl->hasNoSignedWrap() ||
+ Shl->hasOneUse()) {
+ Value *NewCond = Op0;
+ if (!Shl->hasNoUnsignedWrap() && !Shl->hasNoSignedWrap()) {
+ // If the shift may wrap, we need to mask off the shifted bits.
+ unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+ NewCond = Builder.CreateAnd(
+ Op0, APInt::getLowBitsSet(BitWidth, BitWidth - ShiftAmt));
+ }
+ for (auto Case : SI.cases()) {
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ APInt ShiftedCase = Shl->hasNoSignedWrap() ? CaseVal.ashr(ShiftAmt)
+ : CaseVal.lshr(ShiftAmt);
+ Case.setValue(ConstantInt::get(SI.getContext(), ShiftedCase));
+ }
+ return replaceOperand(SI, 0, NewCond);
+ }
+ }
+
+ // Fold switch(zext/sext(X)) into switch(X) if possible.
+ if (match(Cond, m_ZExtOrSExt(m_Value(Op0)))) {
+ bool IsZExt = isa<ZExtInst>(Cond);
+ Type *SrcTy = Op0->getType();
+ unsigned NewWidth = SrcTy->getScalarSizeInBits();
+
+ if (all_of(SI.cases(), [&](const auto &Case) {
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ return IsZExt ? CaseVal.isIntN(NewWidth)
+ : CaseVal.isSignedIntN(NewWidth);
+ })) {
+ for (auto &Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ }
+ return replaceOperand(SI, 0, Op0);
+ }
+ }
+
KnownBits Known = computeKnownBits(Cond, 0, &SI);
unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
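Note: a hedged before/after sketch of the three new switch canonicalizations
(values are illustrative; the shl form also requires every case value to have
at least ShiftAmt trailing zero bits, and the zext/sext form requires every
case constant to fit in the source width):

    #include <cstdint>

    // 'switch (C - X)': each case K is rewritten to C - K, switching on X.
    int subForm(uint32_t x) {
      switch (1 - x) { case 1: return 10; default: return 0; }
      // becomes: switch (x) { case 0: return 10; ... }
    }

    // 'switch (X << S)': cases are shifted right by S, switching on X
    // (X is masked first if the shift may wrap).
    int shlForm(uint32_t x) {
      switch (x << 2) { case 4: return 10; default: return 0; }
      // becomes: switch (x) { case 1: return 10; ... }
    }

    // 'switch (zext/sext(X))': switch directly on the narrow value.
    int extForm(uint8_t x) {
      switch (uint32_t{x}) { case 7: return 10; default: return 0; }
      // becomes an i8 switch: switch (x) { case 7: return 10; ... }
    }
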
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index afb0e6cd1548..e3deafa49bd9 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -174,6 +174,8 @@ const char kAsanAllocasUnpoison[] = "__asan_allocas_unpoison";
const char kAMDGPUAddressSharedName[] = "llvm.amdgcn.is.shared";
const char kAMDGPUAddressPrivateName[] = "llvm.amdgcn.is.private";
+const char kAMDGPUBallotName[] = "llvm.amdgcn.ballot.i64";
+const char kAMDGPUUnreachableName[] = "llvm.amdgcn.unreachable";
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
@@ -699,6 +701,8 @@ struct AddressSanitizer {
Instruction *InsertBefore, Value *Addr,
uint32_t TypeStoreSize, bool IsWrite,
Value *SizeArgument);
+ Instruction *genAMDGPUReportBlock(IRBuilder<> &IRB, Value *Cond,
+ bool Recover);
void instrumentUnusualSizeOrAlignment(Instruction *I,
Instruction *InsertBefore, Value *Addr,
TypeSize TypeStoreSize, bool IsWrite,
@@ -1721,6 +1725,30 @@ Instruction *AddressSanitizer::instrumentAMDGPUAddress(
return InsertBefore;
}
+Instruction *AddressSanitizer::genAMDGPUReportBlock(IRBuilder<> &IRB,
+ Value *Cond, bool Recover) {
+ Module &M = *IRB.GetInsertBlock()->getModule();
+ Value *ReportCond = Cond;
+ if (!Recover) {
+ auto Ballot = M.getOrInsertFunction(kAMDGPUBallotName, IRB.getInt64Ty(),
+ IRB.getInt1Ty());
+ ReportCond = IRB.CreateIsNotNull(IRB.CreateCall(Ballot, {Cond}));
+ }
+
+ auto *Trm =
+ SplitBlockAndInsertIfThen(ReportCond, &*IRB.GetInsertPoint(), false,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+ Trm->getParent()->setName("asan.report");
+
+ if (Recover)
+ return Trm;
+
+ Trm = SplitBlockAndInsertIfThen(Cond, Trm, false);
+ IRB.SetInsertPoint(Trm);
+ return IRB.CreateCall(
+ M.getOrInsertFunction(kAMDGPUUnreachableName, IRB.getVoidTy()), {});
+}
+
void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Instruction *InsertBefore, Value *Addr,
MaybeAlign Alignment,
@@ -1772,7 +1800,15 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
size_t Granularity = 1ULL << Mapping.Scale;
Instruction *CrashTerm = nullptr;
- if (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity)) {
+ bool GenSlowPath = (ClAlwaysSlowPath || (TypeStoreSize < 8 * Granularity));
+
+ if (TargetTriple.isAMDGCN()) {
+ if (GenSlowPath) {
+ auto *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeStoreSize);
+ Cmp = IRB.CreateAnd(Cmp, Cmp2);
+ }
+ CrashTerm = genAMDGPUReportBlock(IRB, Cmp, Recover);
+ } else if (GenSlowPath) {
// We use branch weights for the slow path check, to indicate that the slow
// path is rarely taken. This seems to be the case for SPEC benchmarks.
Instruction *CheckTerm = SplitBlockAndInsertIfThen(
@@ -3629,10 +3665,14 @@ bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
// TODO: We can use vscale_range to convert a scalable value to an
// upper bound on the access size.
return false;
- SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
- if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
- uint64_t Size = SizeOffset.first.getZExtValue();
- int64_t Offset = SizeOffset.second.getSExtValue();
+
+ SizeOffsetAPInt SizeOffset = ObjSizeVis.compute(Addr);
+ if (!SizeOffset.bothKnown())
+ return false;
+
+ uint64_t Size = SizeOffset.Size.getZExtValue();
+ int64_t Offset = SizeOffset.Offset.getSExtValue();
+
// Three checks are required to ensure safety:
// . Offset >= 0 (since the offset is given from the base ptr)
// . Size >= Offset (unsigned)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index ee5b81960417..cfa8ae26c625 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -61,15 +61,15 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
<< " bytes\n");
- SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
+ SizeOffsetValue SizeOffset = ObjSizeEval.compute(Ptr);
- if (!ObjSizeEval.bothKnown(SizeOffset)) {
+ if (!SizeOffset.bothKnown()) {
++ChecksUnable;
return nullptr;
}
- Value *Size = SizeOffset.first;
- Value *Offset = SizeOffset.second;
+ Value *Size = SizeOffset.Size;
+ Value *Offset = SizeOffset.Offset;
ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
Type *IndexTy = DL.getIndexType(Ptr->getType());
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe5a0578bd97..a19b14087254 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1189,12 +1189,10 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) {
}
static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
- // Don't do this for Darwin. compiler-rt uses linker magic.
- if (TT.isOSDarwin())
- return false;
- // Use linker script magic to get data/cnts/name start/end.
- if (TT.isOSAIX() || TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
- TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS() || TT.isOSWindows())
+ // compiler-rt uses linker support to get data/counters/name start/end for
+ // ELF, COFF, Mach-O and XCOFF.
+ if (TT.isOSBinFormatELF() || TT.isOSBinFormatCOFF() ||
+ TT.isOSBinFormatMachO() || TT.isOSBinFormatXCOFF())
return false;
return true;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 3a57709c4e8b..6b95c7028d93 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -330,10 +330,6 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
} // namespace llvm
-static cl::opt<bool>
- PGOOldCFGHashing("pgo-instr-old-cfg-hashing", cl::init(false), cl::Hidden,
- cl::desc("Use the old CFG function hashing"));
-
// Return a string describing the branch condition that can be
// used in static branch probability heuristics:
static std::string getBranchCondString(Instruction *TI) {
@@ -635,34 +631,25 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
JC.update(Indexes);
JamCRC JCH;
- if (PGOOldCFGHashing) {
- // Hash format for context sensitive profile. Reserve 4 bits for other
- // information.
- FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
- (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
- //(uint64_t)ValueSites[IPVK_MemOPSize].size() << 40 |
- (uint64_t)MST.numEdges() << 32 | JC.getCRC();
+ // The higher 32 bits.
+ auto updateJCH = [&JCH](uint64_t Num) {
+ uint8_t Data[8];
+ support::endian::write64le(Data, Num);
+ JCH.update(Data);
+ };
+ updateJCH((uint64_t)SIVisitor.getNumOfSelectInsts());
+ updateJCH((uint64_t)ValueSites[IPVK_IndirectCallTarget].size());
+ updateJCH((uint64_t)ValueSites[IPVK_MemOPSize].size());
+ if (BCI) {
+ updateJCH(BCI->getInstrumentedBlocksHash());
} else {
- // The higher 32 bits.
- auto updateJCH = [&JCH](uint64_t Num) {
- uint8_t Data[8];
- support::endian::write64le(Data, Num);
- JCH.update(Data);
- };
- updateJCH((uint64_t)SIVisitor.getNumOfSelectInsts());
- updateJCH((uint64_t)ValueSites[IPVK_IndirectCallTarget].size());
- updateJCH((uint64_t)ValueSites[IPVK_MemOPSize].size());
- if (BCI) {
- updateJCH(BCI->getInstrumentedBlocksHash());
- } else {
- updateJCH((uint64_t)MST.numEdges());
- }
-
- // Hash format for context sensitive profile. Reserve 4 bits for other
- // information.
- FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
+ updateJCH((uint64_t)MST.numEdges());
}
+ // Hash format for context sensitive profile. Reserve 4 bits for other
+ // information.
+ FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
+
// Reserve bit 60-63 for other information purpose.
FunctionHash &= 0x0FFFFFFFFFFFFFFF;
if (IsCS)
@@ -672,10 +659,8 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
<< ", Selects = " << SIVisitor.getNumOfSelectInsts()
<< ", Edges = " << MST.numEdges() << ", ICSites = "
<< ValueSites[IPVK_IndirectCallTarget].size());
- if (!PGOOldCFGHashing) {
- LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size()
- << ", High32 CRC = " << JCH.getCRC());
- }
+ LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size()
+ << ", High32 CRC = " << JCH.getCRC());
LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";);
if (PGOTraceFuncHash != "-" && F.getName().contains(PGOTraceFuncHash))
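Note: with the legacy hashing path removed, the hash is always assembled the
same way. A minimal sketch of the final combination step as written above;
the 28-bit shift keeps bits 60-63 free for other purposes, at the cost of a
four-bit carry overlap between the two CRCs:

    #include <cstdint>

    uint64_t combineCRCs(uint32_t highCRC /*JCH*/, uint32_t lowCRC /*JC*/) {
      uint64_t Hash = (uint64_t(highCRC) << 28) + lowCRC;
      return Hash & 0x0FFFFFFFFFFFFFFFULL; // reserve bits 60-63
    }
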
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 06c87bd6dc37..6fec54ac7922 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -517,6 +517,18 @@ static Decomposition decompose(Value *V,
return Result;
}
+ // (shl nsw x, shift) is (mul nsw x, (1<<shift)), with the exception of
+ // shift == bw-1.
+ if (match(V, m_NSWShl(m_Value(Op0), m_ConstantInt(CI)))) {
+ uint64_t Shift = CI->getValue().getLimitedValue();
+ if (Shift < Ty->getIntegerBitWidth() - 1) {
+ assert(Shift < 64 && "Would overflow");
+ auto Result = decompose(Op0, Preconditions, IsSigned, DL);
+ Result.mul(int64_t(1) << Shift);
+ return Result;
+ }
+ }
+
return V;
}
@@ -644,7 +656,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
// First try to look up \p V in Value2Index and NewVariables. Otherwise add a
// new entry to NewVariables.
- DenseMap<Value *, unsigned> NewIndexMap;
+ SmallDenseMap<Value *, unsigned> NewIndexMap;
auto GetOrAddIndex = [&Value2Index, &NewVariables,
&NewIndexMap](Value *V) -> unsigned {
auto V2I = Value2Index.find(V);
@@ -668,7 +680,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
IsSigned, IsEq, IsNe);
// Collect variables that are known to be positive in all uses in the
// constraint.
- DenseMap<Value *, bool> KnownNonNegativeVariables;
+ SmallDenseMap<Value *, bool> KnownNonNegativeVariables;
auto &R = Res.Coefficients;
for (const auto &KV : VariablesA) {
R[GetOrAddIndex(KV.Variable)] += KV.Coefficient;
@@ -921,15 +933,20 @@ void State::addInfoForInductions(BasicBlock &BB) {
}
DomTreeNode *DTN = DT.getNode(InLoopSucc);
- auto Inc = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT);
- bool MonotonicallyIncreasing =
- Inc && *Inc == ScalarEvolution::MonotonicallyIncreasing;
- if (MonotonicallyIncreasing) {
- // SCEV guarantees that AR does not wrap, so PN >= StartValue can be added
- // unconditionally.
+ auto IncUnsigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT);
+ auto IncSigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_SGT);
+ bool MonotonicallyIncreasingUnsigned =
+ IncUnsigned && *IncUnsigned == ScalarEvolution::MonotonicallyIncreasing;
+ bool MonotonicallyIncreasingSigned =
+ IncSigned && *IncSigned == ScalarEvolution::MonotonicallyIncreasing;
+ // If SCEV guarantees that AR does not wrap, PN >= StartValue can be added
+ // unconditionally.
+ if (MonotonicallyIncreasingUnsigned)
WorkList.push_back(
FactOrCheck::getConditionFact(DTN, CmpInst::ICMP_UGE, PN, StartValue));
- }
+ if (MonotonicallyIncreasingSigned)
+ WorkList.push_back(
+ FactOrCheck::getConditionFact(DTN, CmpInst::ICMP_SGE, PN, StartValue));
APInt StepOffset;
if (auto *C = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
@@ -953,11 +970,17 @@ void State::addInfoForInductions(BasicBlock &BB) {
WorkList.push_back(FactOrCheck::getConditionFact(
DTN, CmpInst::ICMP_UGE, StartValue, PN,
ConditionTy(CmpInst::ICMP_ULE, B, StartValue)));
+ WorkList.push_back(FactOrCheck::getConditionFact(
+ DTN, CmpInst::ICMP_SGE, StartValue, PN,
+ ConditionTy(CmpInst::ICMP_SLE, B, StartValue)));
// Add PN > B conditional on B <= StartValue which guarantees that the loop
// exits when reaching B with a step of -1.
WorkList.push_back(FactOrCheck::getConditionFact(
DTN, CmpInst::ICMP_UGT, PN, B,
ConditionTy(CmpInst::ICMP_ULE, B, StartValue)));
+ WorkList.push_back(FactOrCheck::getConditionFact(
+ DTN, CmpInst::ICMP_SGT, PN, B,
+ ConditionTy(CmpInst::ICMP_SLE, B, StartValue)));
return;
}
@@ -968,37 +991,31 @@ void State::addInfoForInductions(BasicBlock &BB) {
return;
if (!StepOffset.isOne()) {
- auto *UpperGEP = dyn_cast<GetElementPtrInst>(B);
- if (!UpperGEP || UpperGEP->getPointerOperand() != StartValue ||
- !UpperGEP->isInBounds())
- return;
-
- MapVector<Value *, APInt> UpperVariableOffsets;
- APInt UpperConstantOffset(StepOffset.getBitWidth(), 0);
- const DataLayout &DL = BB.getModule()->getDataLayout();
- if (!UpperGEP->collectOffset(DL, StepOffset.getBitWidth(),
- UpperVariableOffsets, UpperConstantOffset))
- return;
- // All variable offsets and the constant offset have to be a multiple of the
- // step.
- if (!UpperConstantOffset.urem(StepOffset).isZero() ||
- any_of(UpperVariableOffsets, [&StepOffset](const auto &P) {
- return !P.second.urem(StepOffset).isZero();
- }))
+ // Check whether B-Start is known to be a multiple of StepOffset.
+ const SCEV *BMinusStart = SE.getMinusSCEV(SE.getSCEV(B), StartSCEV);
+ if (isa<SCEVCouldNotCompute>(BMinusStart) ||
+ !SE.getConstantMultiple(BMinusStart).urem(StepOffset).isZero())
return;
}
// AR may wrap. Add PN >= StartValue conditional on StartValue <= B which
// guarantees that the loop exits before wrapping in combination with the
// restrictions on B and the step above.
- if (!MonotonicallyIncreasing) {
+ if (!MonotonicallyIncreasingUnsigned)
WorkList.push_back(FactOrCheck::getConditionFact(
DTN, CmpInst::ICMP_UGE, PN, StartValue,
ConditionTy(CmpInst::ICMP_ULE, StartValue, B)));
- }
+ if (!MonotonicallyIncreasingSigned)
+ WorkList.push_back(FactOrCheck::getConditionFact(
+ DTN, CmpInst::ICMP_SGE, PN, StartValue,
+ ConditionTy(CmpInst::ICMP_SLE, StartValue, B)));
+
WorkList.push_back(FactOrCheck::getConditionFact(
DTN, CmpInst::ICMP_ULT, PN, B,
ConditionTy(CmpInst::ICMP_ULE, StartValue, B)));
+ WorkList.push_back(FactOrCheck::getConditionFact(
+ DTN, CmpInst::ICMP_SLT, PN, B,
+ ConditionTy(CmpInst::ICMP_SLE, StartValue, B)));
}
void State::addInfoFor(BasicBlock &BB) {
@@ -1655,15 +1672,14 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
DFSInStack);
}
- LLVM_DEBUG(dbgs() << "Processing ");
-
// For a block, check if any CmpInsts become known based on the current set
// of constraints.
if (CB.isCheck()) {
Instruction *Inst = CB.getInstructionToSimplify();
if (!Inst)
continue;
- LLVM_DEBUG(dbgs() << "condition to simplify: " << *Inst << "\n");
+ LLVM_DEBUG(dbgs() << "Processing condition to simplify: " << *Inst
+ << "\n");
if (auto *II = dyn_cast<WithOverflowInst>(Inst)) {
Changed |= tryToSimplifyOverflowMath(II, Info, ToRemove);
} else if (auto *Cmp = dyn_cast<ICmpInst>(Inst)) {
@@ -1682,7 +1698,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
}
auto AddFact = [&](CmpInst::Predicate Pred, Value *A, Value *B) {
- LLVM_DEBUG(dbgs() << "fact to add to the system: ";
+ LLVM_DEBUG(dbgs() << "Processing fact to add to the system: ";
dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "\n");
if (Info.getCS(CmpInst::isSigned(Pred)).size() > MaxRows) {
LLVM_DEBUG(
@@ -1731,8 +1747,17 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
A = CB.Cond.Op0;
B = CB.Cond.Op1;
if (CB.DoesHold.Pred != CmpInst::BAD_ICMP_PREDICATE &&
- !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1))
+ !Info.doesHold(CB.DoesHold.Pred, CB.DoesHold.Op0, CB.DoesHold.Op1)) {
+ LLVM_DEBUG({
+ dbgs() << "Not adding fact ";
+ dumpUnpackedICmp(dbgs(), Pred, A, B);
+ dbgs() << " because precondition ";
+ dumpUnpackedICmp(dbgs(), CB.DoesHold.Pred, CB.DoesHold.Op0,
+ CB.DoesHold.Op1);
+ dbgs() << " does not hold.\n";
+ });
continue;
+ }
} else {
bool Matched = match(CB.Inst, m_Intrinsic<Intrinsic::assume>(
m_ICmp(Pred, m_Value(A), m_Value(B))));
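Note: the new shl-nsw decomposition rests on the identity stated in its
comment; the shift == bitwidth-1 case is excluded, presumably because
1 << (bw-1) lands on the sign bit rather than giving a positive multiplier.
A hedged sketch of the equivalence:

    #include <cstdint>

    // For a no-signed-wrap shift with s < bw - 1, x << s == x * (1 << s),
    // so the constraint system can reuse its existing multiply handling.
    int64_t viaShl(int64_t x) { return x << 3; } // nsw assumed
    int64_t viaMul(int64_t x) { return x * 8; }  // equivalent decomposition
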
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index c44d3748a80d..9235850de92f 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -94,6 +94,31 @@ STATISTIC(NumUDivURemsNarrowedExpanded,
"Number of bound udiv's/urem's expanded");
STATISTIC(NumZExt, "Number of non-negative deductions");
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
+ if (Constant *C = LVI->getConstant(V, At))
+ return C;
+
+ // TODO: The following really should be sunk inside LVI's core algorithm, or
+ // at least the outer shims around such.
+ auto *C = dyn_cast<CmpInst>(V);
+ if (!C)
+ return nullptr;
+
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1)
+ return nullptr;
+
+ LazyValueInfo::Tristate Result = LVI->getPredicateAt(
+ C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false);
+ if (Result == LazyValueInfo::Unknown)
+ return nullptr;
+
+ return (Result == LazyValueInfo::True)
+ ? ConstantInt::getTrue(C->getContext())
+ : ConstantInt::getFalse(C->getContext());
+}
+
static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy() || isa<Constant>(S->getCondition()))
return false;
@@ -106,7 +131,7 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
C = LVI->getConstantOnEdge(S->getCondition(), PN->getIncomingBlock(U),
I->getParent(), I);
else
- C = LVI->getConstant(S->getCondition(), I);
+ C = getConstantAt(S->getCondition(), I, LVI);
auto *CI = dyn_cast_or_null<ConstantInt>(C);
if (!CI)
@@ -1109,30 +1134,6 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
return true;
}
-
-static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
- if (Constant *C = LVI->getConstant(V, At))
- return C;
-
- // TODO: The following really should be sunk inside LVI's core algorithm, or
- // at least the outer shims around such.
- auto *C = dyn_cast<CmpInst>(V);
- if (!C) return nullptr;
-
- Value *Op0 = C->getOperand(0);
- Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
- if (!Op1) return nullptr;
-
- LazyValueInfo::Tristate Result = LVI->getPredicateAt(
- C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false);
- if (Result == LazyValueInfo::Unknown)
- return nullptr;
-
- return (Result == LazyValueInfo::True) ?
- ConstantInt::getTrue(C->getContext()) :
- ConstantInt::getFalse(C->getContext());
-}
-
static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
const SimplifyQuery &SQ) {
bool FnChanged = false;
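Note: moving getConstantAt() above processSelect() is not pure code motion:
the non-PHI branch of processSelect() now calls it instead of
LVI->getConstant(), so a select whose condition is a comparison provable at
the use site can fold even though the condition never becomes a plain
constant. A hedged example of the shape this enables (illustrative function):

    int example(int x) {
      if (x > 10) {
        bool c = x > 0;    // not a constant value, but provable here via
                           // LVI->getPredicateAt on the compare
        return c ? x : -x; // the select folds to x
      }
      return 0;
    }
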
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
index 656abdb0abbf..75cddfa16d6d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1097,10 +1097,8 @@ private:
// For array or vector indices, scale the index by the size of the
// type.
APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
- GEPOffset +=
- Index *
- APInt(Offset.getBitWidth(),
- DL.getTypeAllocSize(GTI.getIndexedType()).getFixedValue());
+ GEPOffset += Index * APInt(Offset.getBitWidth(),
+ GTI.getSequentialElementStride(DL));
}
// If this index has computed an intermediate pointer which is not
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index b8c9d9d100f1..225dd454068c 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -843,7 +843,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
// constant offset to a byte offset, and later offset the remainder of
// the original GEP with this byte offset.
AccumulativeByteOffset +=
- ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ ConstantOffset * GTI.getSequentialElementStride(*DL);
}
} else if (LowerGEP) {
StructType *StTy = GTI.getStructType();
@@ -884,7 +884,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
continue;
APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
+ GTI.getSequentialElementStride(*DL));
// Scale the index by element size.
if (ElementSize != 1) {
if (ElementSize.isPowerOf2()) {
@@ -946,7 +946,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
continue;
APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
+ GTI.getSequentialElementStride(*DL));
// Scale the index by element size.
if (ElementSize != 1) {
if (ElementSize.isPowerOf2()) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 543469d62fe7..ca1f3a0c0ae3 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -547,7 +547,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
// indices except this current one.
const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
Value *ArrayIdx = GEP->getOperand(I);
- uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ uint64_t ElementSize = GTI.getSequentialElementStride(*DL);
if (ArrayIdx->getType()->getIntegerBitWidth() <=
DL->getIndexSizeInBits(GEP->getAddressSpace())) {
// Skip factoring if ArrayIdx is wider than the index size, because
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index 0990c750af55..ea3135630665 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -33,37 +33,37 @@ STATISTIC(NumVFDeclAdded,
STATISTIC(NumCompUsedAdded,
"Number of `@llvm.compiler.used` operands that have been added.");
-/// A helper function that adds the vector function declaration that
-/// vectorizes the CallInst CI with a vectorization factor of VF
-/// lanes. The TLI assumes that all parameters and the return type of
-/// CI (other than void) need to be widened to a VectorType of VF
-/// lanes.
+/// A helper function that adds the vector variant declaration for vectorizing
+/// the CallInst \p CI with a vectorization factor of \p VF lanes. For each
+/// mapping, TLI provides a VABI prefix, which contains all information
+/// required to create a vector function declaration.
static void addVariantDeclaration(CallInst &CI, const ElementCount &VF,
- bool Predicate, const StringRef VFName) {
+ const VecDesc *VD) {
Module *M = CI.getModule();
+ FunctionType *ScalarFTy = CI.getFunctionType();
- // Add function declaration.
- Type *RetTy = ToVectorTy(CI.getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.args())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- assert(!CI.getFunctionType()->isVarArg() &&
- "VarArg functions are not supported.");
- if (Predicate)
- Tys.push_back(ToVectorTy(Type::getInt1Ty(RetTy->getContext()), VF));
- FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
- Function *VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFName, M);
- VectorF->copyAttributesFrom(CI.getCalledFunction());
+ assert(!ScalarFTy->isVarArg() && "VarArg functions are not supported.");
+
+ const std::optional<VFInfo> Info = VFABI::tryDemangleForVFABI(
+ VD->getVectorFunctionABIVariantString(), ScalarFTy);
+
+ assert(Info && "Failed to demangle vector variant");
+ assert(Info->Shape.VF == VF && "Mangled name does not match VF");
+
+ const StringRef VFName = VD->getVectorFnName();
+ FunctionType *VectorFTy = VFABI::createFunctionType(*Info, ScalarFTy);
+ Function *VecFunc =
+ Function::Create(VectorFTy, Function::ExternalLinkage, VFName, M);
+ VecFunc->copyAttributesFrom(CI.getCalledFunction());
++NumVFDeclAdded;
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
- << "` of type " << *(VectorF->getType()) << "\n");
+ << "` of type " << *VectorFTy << "\n");
// Make function declaration (without a body) "sticky" in the IR by
// listing it in the @llvm.compiler.used intrinsic.
- assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
+ assert(!VecFunc->size() && "VFABI attribute requires `@llvm.compiler.used` "
"only on declarations.");
- appendToCompilerUsed(*M, {VectorF});
+ appendToCompilerUsed(*M, {VecFunc});
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
<< "` to `@llvm.compiler.used`.\n");
++NumCompUsedAdded;
@@ -100,7 +100,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
}
Function *VariantF = M->getFunction(VD->getVectorFnName());
if (!VariantF)
- addVariantDeclaration(CI, VF, Predicate, VD->getVectorFnName());
+ addVariantDeclaration(CI, VF, VD);
}
};
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index ab95698abc43..3dc6016a0a37 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -310,6 +310,7 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
}
+ DefaultDest->removePredecessor(BB);
SI->setDefaultDest(NewUnreachableBB);
Updates.push_back({DominatorTree::Delete, BB, DefaultDest});
Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB});
@@ -1063,14 +1064,17 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
// is ready.
if (SCValue.isConstantRange(/*UndefAllowed=*/false)) {
const ConstantRange &Range = SCValue.getConstantRange();
+ unsigned ReachableCaseCount = 0;
for (const auto &Case : SI->cases()) {
const APInt &CaseValue = Case.getCaseValue()->getValue();
- if (Range.contains(CaseValue))
+ if (Range.contains(CaseValue)) {
Succs[Case.getSuccessorIndex()] = true;
+ ++ReachableCaseCount;
+ }
}
- // TODO: Determine whether default case is reachable.
- Succs[SI->case_default()->getSuccessorIndex()] = true;
+ Succs[SI->case_default()->getSuccessorIndex()] =
+ Range.isSizeLargerThan(ReachableCaseCount);
return;
}
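Note: the default successor is now only kept feasible when the condition's
range contains more values than the reachable cases cover. A hedged
illustration (illustrative function):

    #include <cstdint>

    int dispatch(uint32_t x) {
      uint32_t c = x & 1;  // the solver proves c is in the range [0, 2)
      switch (c) {
      case 0: return 10;
      case 1: return 20;   // 2 reachable cases cover the size-2 range,
      default: return -1;  // so the default edge is marked infeasible
      }
    }
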
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 55e375670cc6..61d891d65346 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5414,11 +5414,13 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
}
static void createUnreachableSwitchDefault(SwitchInst *Switch,
- DomTreeUpdater *DTU) {
+ DomTreeUpdater *DTU,
+ bool RemoveOrigDefaultBlock = true) {
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
auto *BB = Switch->getParent();
auto *OrigDefaultBlock = Switch->getDefaultDest();
- OrigDefaultBlock->removePredecessor(BB);
+ if (RemoveOrigDefaultBlock)
+ OrigDefaultBlock->removePredecessor(BB);
BasicBlock *NewDefaultBlock = BasicBlock::Create(
BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
OrigDefaultBlock);
@@ -5427,7 +5429,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
- if (!is_contained(successors(BB), OrigDefaultBlock))
+ if (RemoveOrigDefaultBlock &&
+ !is_contained(successors(BB), OrigDefaultBlock))
Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
DTU->applyUpdates(Updates);
}
@@ -5609,10 +5612,28 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
- SI->getNumCases() == (1ULL << NumUnknownBits)) {
- createUnreachableSwitchDefault(SI, DTU);
- return true;
+ NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase =
+ cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
if (DeadCases.empty())
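Note: when exactly one of the 2^NumUnknownBits possible values lacks a case,
that value is recovered by XOR-ing the existing case values (the XOR of a
full power-of-two range of width >= 2 is zero), the default target is
promoted to an explicit case, and the default edge becomes unreachable,
which unlocks later transforms such as lookup tables. A hedged before/after
sketch:

    #include <cstdint>

    int before(uint32_t x) {
      switch (x & 3) {    // only bits 0-1 are unknown: 4 possible values
      case 0: return 10;
      case 1: return 11;
      case 2: return 12;
      default: return 13; // reachable only with the value 3
      }
    }

    int after(uint32_t x) {
      switch (x & 3) {
      case 0: return 10;
      case 1: return 11;
      case 2: return 12;
      case 3: return 13;  // default promoted to the single missing case
      default: __builtin_unreachable();
      }
    }
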
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 760a626c8b6f..a7cd68e860e4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -3735,26 +3735,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
// Also try to simplify calls to fortified library functions.
if (Value *SimplifiedFortifiedCI =
- FortifiedSimplifier.optimizeCall(CI, Builder)) {
- // Try to further simplify the result.
- CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
- if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
- // Ensure that SimplifiedCI's uses are complete, since some calls have
- // their uses analyzed.
- replaceAllUsesWith(CI, SimplifiedCI);
-
- // Set insertion point to SimplifiedCI to guarantee we reach all uses
- // we might replace later on.
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(SimplifiedCI);
- if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
- // If we were able to further simplify, remove the now redundant call.
- substituteInParent(SimplifiedCI, V);
- return V;
- }
- }
+ FortifiedSimplifier.optimizeCall(CI, Builder))
return SimplifiedFortifiedCI;
- }
// Then check for known library functions.
if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index fa2459d1ca02..1f11d4894f77 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1193,7 +1193,7 @@ std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
OpA->getType() != OpB->getType())
return std::nullopt;
- uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ uint64_t Stride = GTIA.getSequentialElementStride(DL);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 577ce8000de2..cff72ae263d8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,9 +167,14 @@ public:
}
VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
- DebugLoc DL, const Twine &Name = "") {
- return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
- Name);
+ DebugLoc DL, const Twine &Name = "",
+ std::optional<FastMathFlags> FMFs = std::nullopt) {
+ auto *Select =
+ FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+ *FMFs, DL, Name)
+ : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+ DL, Name);
+ return tryInsertInstruction(Select);
}
/// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
@@ -341,16 +346,20 @@ public:
/// Return the best VPlan for \p VF.
VPlan &getBestPlanFor(ElementCount VF) const;
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected \p VF, \p UF and VPlan \p BestPlan.
+ /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
+ /// according to the best selected \p VF and \p UF.
+ ///
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
/// vectorization re-using plans for both the main and epilogue vector loops.
/// It should be removed once the re-use issue has been fixed.
/// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
- /// to re-use expansion results generated during main plan execution. Returns
- /// a mapping of SCEVs to their expanded IR values. Note that this is a
- /// temporary workaround needed due to the current epilogue handling.
- DenseMap<const SCEV *, Value *>
+ /// to re-use expansion results generated during main plan execution.
+ ///
+ /// Returns a mapping of SCEVs to their expanded IR values and a mapping for
+ /// the reduction resume values. Note that this is a temporary workaround
+ /// needed due to the current epilogue handling.
+ std::pair<DenseMap<const SCEV *, Value *>,
+ DenseMap<const RecurrenceDescriptor *, Value *>>
executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
InnerLoopVectorizer &LB, DominatorTree *DT,
bool IsEpilogueVectorization,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8e135d80f4f2..51ce88480c08 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,10 +584,6 @@ public:
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
- // Returns the resume value (bc.merge.rdx) for a reduction as
- // generated by fixReduction.
- PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
-
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -626,9 +622,6 @@ protected:
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);
- /// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs(VPTransformState &State);
-
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
@@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
}
-PHINode *InnerLoopVectorizer::getReductionResumeValue(
- const RecurrenceDescriptor &RdxDesc) {
- auto It = ReductionResumeValues.find(&RdxDesc);
- assert(It != ReductionResumeValues.end() &&
- "Expected to find a resume value for the reduction.");
- return It->second;
-}
-
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs(State);
+ // This is the second stage of vectorizing recurrences. Note that fixing
+  // reduction phis is already modeled in VPlan.
+ // TODO: Also model fixing fixed-order recurrence phis in VPlan.
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
+ for (VPRecipeBase &R : HeaderVPBB->phis()) {
+ if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+ fixFixedOrderRecurrence(FOR, State);
+ }
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
@@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
- VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
@@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #2: We now need to fix the recurrences by adding incoming edges to
- // the currently empty PHI nodes. At this point every instruction in the
- // original loop is widened to a vector form so we can use them to construct
- // the incoming edges.
- VPBasicBlock *Header =
- State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- fixReduction(ReductionPhi, State);
- }
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
- }
-}
-
void InnerLoopVectorizer::fixFixedOrderRecurrence(
VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Phi->setName("scalar.recur");
}
-void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
- VPTransformState &State) {
- PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
- // Get it's reduction variable descriptor.
- assert(Legal->isReductionVariable(OrigPhi) &&
- "Unable to find the reduction variable");
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
- RecurKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
- State.setDebugLocFrom(I->getDebugLoc());
-
- VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
-
- State.setDebugLocFrom(LoopExitInst->getDebugLoc());
-
- Type *PhiTy = OrigPhi->getType();
- // If tail is folded by masking, the vector value to leave the loop should be
- // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
- // instead of the former. For an inloop reduction the reduction will already
- // be predicated, and does not need to be handled here.
- if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
- VPValue *Def = nullptr;
- for (VPUser *U : LoopExitInstDef->users()) {
- auto *S = dyn_cast<VPInstruction>(U);
- if (S && S->getOpcode() == Instruction::Select) {
- Def = S;
- break;
- }
- }
- if (Def)
- LoopExitInstDef = Def;
- }
-
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- RdxParts[Part] = State.get(LoopExitInstDef, Part);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- }
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
- // The middle block terminator has already been assigned a DebugLoc here (the
- // OrigLoop's single latch terminator). We want the whole middle block to
- // appear to execute on this line because: (a) it is all compiler generated,
- // (b) these instructions are always executed after evaluating the latch
- // conditional branch, and (c) other passes may add new predecessors which
- // terminate on this line. This is the easiest way to ensure we don't
- // accidentally cause an extra step back into the loop while debugging.
- State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
- if (PhiR->isOrdered())
- ReducedPartRdx = RdxParts[UF - 1];
- else {
- // Floating-point operations should have some FMF to enable the reduction.
- IRBuilderBase::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = RdxParts[Part];
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- ReducedPartRdx = Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
- ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
- ReducedPartRdx, RdxPart);
- else
- ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
- }
-
- // Create the reduction after the loop. Note that inloop reductions create the
- // target reduction in the loop using a Reduction recipe.
- if (VF.isVector() && !PhiR->isInLoop()) {
- ReducedPartRdx =
- createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (PhiTy != RdxDesc.getRecurrenceType())
- ReducedPartRdx = RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
- : Builder.CreateZExt(ReducedPartRdx, PhiTy);
- }
-
- PHINode *ResumePhi =
- dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
-
- // If we are fixing reductions in the epilogue loop then we should already
- // have created a bc.merge.rdx Phi after the main vector body. Ensure that
- // we carry over the incoming values correctly.
- for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
- if (Incoming == LoopMiddleBlock)
- BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
- else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
- BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
- Incoming);
- else
- BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
- }
-
- // Set the resume value for this reduction
- ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
-
- // If there were stores of the reduction value to a uniform memory address
- // inside the loop, create the final store here.
- if (StoreInst *SI = RdxDesc.IntermediateStore) {
- StoreInst *NewSI =
- Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
- SI->getAlign());
- propagateMetadata(NewSI, SI);
-
- // If the reduction value is used in other places,
- // then let the code below create PHI's for that.
- }
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
-
- // We know that the loop is in LCSSA form. We need to update the PHI nodes
- // in the exit blocks. See comment on analogous loop in
- // fixFixedOrderRecurrence for a more complete explaination of the logic.
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
- State.Plan->removeLiveOut(&LCSSAPhi);
- }
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
@@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- // If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF, provided it
- // is at least 1.
- //
- // For scalable vectors we can't know if interleaving is beneficial. It may
- // not be beneficial for small loops if none of the lanes in the second vector
- // iterations is enabled. However, for larger loops, there is likely to be a
- // similar benefit as for fixed-width vectors. For now, we choose to leave
- // the InterleaveCount as if vscale is '1', although if some information about
- // the vector is known (e.g. min vector size), we can make a better decision.
- if (BestKnownTC) {
- MaxInterleaveCount =
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
- // Make sure MaxInterleaveCount is greater than 0.
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+ unsigned EstimatedVF = VF.getKnownMinValue();
+ if (VF.isScalable()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
+ EstimatedVF *= *VScale;
+ }
+ assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+
+ unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (KnownTC) {
+ // If trip count is known we select between two prospective ICs, where
+ // 1) the aggressive IC is capped by the trip count divided by VF
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+ // The final IC is selected in a way that the epilogue loop trip count is
+ // minimized while maximizing the IC itself, so that we either run the
+ // vector loop at least once if it generates a small epilogue loop, or else
+ // we run the vector loop at least twice.
+
+ unsigned InterleaveCountUB = bit_floor(
+ std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ unsigned InterleaveCountLB = bit_floor(std::max(
+ 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ MaxInterleaveCount = InterleaveCountLB;
+
+ if (InterleaveCountUB != InterleaveCountLB) {
+ unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      // If both produce the same scalar tail, maximize the IC to do the same
+      // work in fewer vector loop iterations.
+ if (TailTripCountUB == TailTripCountLB)
+ MaxInterleaveCount = InterleaveCountUB;
+ }
+ } else if (BestKnownTC) {
+    // If the trip count is only an estimated compile-time constant, cap the
+    // IC at the trip count divided by (VF * 2), so that the vector loop runs
+    // at least twice. This makes interleaving appear profitable when an
+    // epilogue loop is present. Since the exact trip count is not known, we
+    // choose to be conservative in our IC estimate.
+ MaxInterleaveCount = bit_floor(std::max(
+ 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
assert(MaxInterleaveCount > 0 &&
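To see how the two prospective interleave counts above interact, here is a minimal standalone sketch of the same clamping arithmetic (standard C++20, not LLVM code; function and parameter names are illustrative):

#include <algorithm>
#include <bit>
#include <cassert>
#include <cstdio>

// Model of the trip-count-driven interleave-count clamp: prefer the
// aggressive bound only when it leaves the same scalar tail as the
// conservative one.
unsigned pickMaxIC(unsigned KnownTC, unsigned EstimatedVF, unsigned MaxIC) {
  assert(KnownTC && EstimatedVF && MaxIC && "all inputs must be positive");
  // Aggressive bound: the vector loop runs at least once.
  unsigned UB =
      std::bit_floor(std::max(1u, std::min(KnownTC / EstimatedVF, MaxIC)));
  // Conservative bound: the vector loop runs at least twice.
  unsigned LB = std::bit_floor(
      std::max(1u, std::min(KnownTC / (EstimatedVF * 2), MaxIC)));
  if (UB != LB &&
      KnownTC % (EstimatedVF * UB) == KnownTC % (EstimatedVF * LB))
    return UB; // same scalar tail, so do the work in fewer iterations
  return LB;
}

int main() {
  std::printf("%u\n", pickMaxIC(32, 8, 8)); // 4: both tails are 0
  std::printf("%u\n", pickMaxIC(24, 8, 8)); // 1: tails differ (8 vs 0)
}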
@@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
-SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+// Check if \p RedResult is a ComputeReductionResult instruction, and if it
+// is, create a merge phi node for it and add it to \p ReductionResumeValues.
+static void createAndCollectMergePhiForReduction(
+ VPInstruction *RedResult,
+ DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
+ VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+ if (!RedResult ||
+ RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+ return;
+
+ auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Value *FinalValue =
+ State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+ auto *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+  // TODO: bc.merge.rdx should not be created here; instead, it should be
+  // modeled in VPlan.
+ BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(FinalValue, Incoming);
+ else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // TODO: This fixup should instead be modeled in VPlan.
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+
+ ReductionResumeValues[&RdxDesc] = BCBlockPhi;
+}
+
+std::pair<DenseMap<const SCEV *, Value *>,
+ DenseMap<const RecurrenceDescriptor *, Value *>>
+LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
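The helper above picks the incoming value of bc.merge.rdx per predecessor of the scalar preheader. A simplified standalone model of that three-way choice (illustrative names, not the LLVM API):

#include <string>

enum class Pred { MiddleBlock, EpilogueResumePhi, Bypass };

// Which value flows into bc.merge.rdx from each kind of predecessor.
std::string incomingFor(Pred P) {
  switch (P) {
  case Pred::MiddleBlock:
    return "final reduction value computed by the vector loop"; // FinalValue
  case Pred::EpilogueResumePhi:
    return "resume value carried over from the main vector loop"; // ResumePhi
  case Pred::Bypass:
    return "reduction start value"; // ReductionStartValue
  }
  return {};
}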
@@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
+ // 2.5 Collect reduction resume values.
+ DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
+ auto *ExitVPBB =
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
+ ReductionResumeValues, State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB]);
+ }
+
+ // 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
MDNode *OrigLoopID = OrigLoop->getLoopID();
@@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
- return State.ExpandedSCEVs;
+ return {State.ExpandedSCEVs, ReductionResumeValues};
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (ECEntryIt != EdgeMaskCache.end())
return ECEntryIt->second;
- VPValue *SrcMask = createBlockInMask(Src, Plan);
+ VPValue *SrcMask = getBlockInMask(Src);
// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
BlockMaskCache[Header] = BlockMask;
}
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
+VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+ // Return the cached value.
+ BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
+ assert(BCEntryIt != BlockMaskCache.end() &&
+ "Trying to access mask for block without one.");
+ return BCEntryIt->second;
+}
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+ assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
"Loop header must have cached block mask");
@@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
- if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
+    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
+      BlockMaskCache[BB] = EdgeMask;
+      return;
+    }
if (!BlockMask) { // BlockMask has its initialized nullptr value.
BlockMask = EdgeMask;
@@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}
- return BlockMaskCache[BB] = BlockMask;
+ BlockMaskCache[BB] = BlockMask;
}
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
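The split of createBlockInMask into a create step and a getBlockInMask query follows a create-once/query-later cache discipline; a minimal standalone analogue (illustrative, not the VPlan API):

#include <cassert>
#include <unordered_map>

struct Mask; // opaque; by convention a null mask means "all ones"

class BlockMaskCacheModel {
  std::unordered_map<int, Mask *> Cache; // int stands in for BasicBlock*

public:
  void create(int BB, Mask *M) {
    assert(Cache.count(BB) == 0 && "Mask for block already computed");
    Cache[BB] = M;
  }
  Mask *get(int BB) const {
    auto It = Cache.find(BB);
    assert(It != Cache.end() && "Trying to access mask for block without one");
    return It->second;
  }
};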
@@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), *Plan);
+ Mask = getBlockInMask(I->getParent());
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
- auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
- Reverse, I->getDebugLoc());
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ Ptr->getUnderlyingValue()->stripPointerCasts());
+ auto *VectorPtr = new VPVectorPointerRecipe(
+ Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
+ I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
}
@@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// all-true mask.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(CI))
- Mask = createBlockInMask(CI->getParent(), *Plan);
+ Mask = getBlockInMask(CI->getParent());
else
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
- VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+ VPValue *Mask = getBlockInMask(I->getParent());
VPValue *One = Plan->getVPValueOrAddLiveIn(
ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
@@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.
- BlockInMask = createBlockInMask(I->getParent(), Plan);
+ BlockInMask = getBlockInMask(I->getParent());
}
auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
- // Proactively create header mask. Masks for other blocks are created on
- // demand.
- RecipeBuilder.createHeaderMask(*Plan);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
VPBasicBlock *VPBB = HeaderVPBB;
+ bool NeedsMasks = CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
+ if (VPBB == HeaderVPBB)
+ RecipeBuilder.createHeaderMask(*Plan);
+ else if (NeedsMasks)
+ RecipeBuilder.createBlockInMask(BB, *Plan);
+
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
@@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
+//
+// A ComputeReductionResult recipe is added to the middle block, also for
+// in-loop reductions which compute their result in-loop, because generating
+// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
  // sunk outside of the loop keep the same order as they had in the
  // original loop.
@@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase *R : ReductionPHIList)
R->moveBefore(*Header, Header->getFirstNonPhi());
- SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
- InLoopReductionPhis.push_back(PhiR);
- }
- for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
@@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(CurrentLink);
- CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
+ CondOp = RecipeBuilder.getBlockInMask(BB);
}
VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
+ if (!PhiR)
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
- if (CM.foldTailByMasking()) {
- VPValue *Cond =
- RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
- VPValue *Red = PhiR->getBackedgeValue();
- assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
+ auto *OrigExitingVPV = PhiR->getBackedgeValue();
+ auto *NewExitingVPV = PhiR->getBackedgeValue();
+ if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
+ VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
+ assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
- FastMathFlags FMFs = RdxDesc.getFastMathFlags();
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
- Result =
+ std::optional<FastMathFlags> FMFs =
PhiTy->isFloatingPointTy()
- ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
- : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
- Result->insertBefore(&*Builder.getInsertPoint());
- Red->replaceUsesWithIf(
- Result->getVPSingleValue(),
- [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ ? std::make_optional(RdxDesc.getFastMathFlags())
+ : std::nullopt;
+ NewExitingVPV =
+ Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+ OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
+ return isa<VPInstruction>(&U) &&
+ cast<VPInstruction>(&U)->getOpcode() ==
+ VPInstruction::ComputeReductionResult;
+ });
if (PreferPredicatedReductionSelect ||
TTI.preferPredicatedReductionSelect(
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags()))
- PhiR->setOperand(1, Result->getVPSingleValue());
+ PhiR->setOperand(1, NewExitingVPV);
}
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
@@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxTy = RdxDesc.getRecurrenceType();
- auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
- Result->getVPSingleValue(), RdxTy);
+ auto *Trunc =
+ new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
auto *Extnd =
RdxDesc.isSigned()
? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
: new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
- Trunc->insertAfter(Result);
+ Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
Extnd->insertAfter(Trunc);
- Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
- Trunc->setOperand(0, Result->getVPSingleValue());
+ if (PhiR->getOperand(1) == NewExitingVPV)
+ PhiR->setOperand(1, Extnd->getVPSingleValue());
+ NewExitingVPV = Extnd;
}
+
+ // We want code in the middle block to appear to execute on the location of
+ // the scalar loop's latch terminator because: (a) it is all compiler
+ // generated, (b) these instructions are always executed after evaluating
+ // the latch conditional branch, and (c) other passes may add new
+ // predecessors which terminate on this line. This is the easiest way to
+ // ensure we don't accidentally cause an extra step back into the loop while
+ // debugging.
+ DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+
+ // TODO: At the moment ComputeReductionResult also drives creation of the
+ // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
+ // even for in-loop reductions, until the reduction resume value handling is
+ // also modeled in VPlan.
+ auto *FinalReductionResult = new VPInstruction(
+ VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
+ ->appendRecipe(FinalReductionResult);
+ OrigExitingVPV->replaceUsesWithIf(
+ FinalReductionResult,
+ [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
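The truncate-then-extend handling above is only sound because earlier analysis proved the reduction fits in the narrower recurrence type; a small standalone illustration of the equivalence under that assumption (plain C++, add reduction, i8 inside an i32 phi):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Reduce in the narrow type, then sign-extend the final value. Matches the
// wide reduction as long as every intermediate sum fits in int8_t.
int32_t reduceNarrow(const std::vector<int32_t> &Vals) {
  int8_t Acc = 0;
  for (int32_t V : Vals)
    Acc = static_cast<int8_t>(Acc + static_cast<int8_t>(V)); // trunc + add
  return static_cast<int32_t>(Acc);                          // sext
}

int main() {
  std::vector<int32_t> Vals = {1, 2, 3, -4};
  assert(reduceNarrow(Vals) == std::accumulate(Vals.begin(), Vals.end(), 0));
}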
@@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks);
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
- auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
- BestMainPlan, MainILV, DT, true);
+ const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
+ EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = MainILV.getReductionResumeValue(
- ReductionPhi->getRecurrenceDescriptor());
+ ResumeV = ReductionResumeValues
+ .find(&ReductionPhi->getRecurrenceDescriptor())
+ ->second;
} else {
// Create induction resume values for both widened pointer and
// integer/fp inductions and update the start value of the induction
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 304991526064..8e22b54f002d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10596,7 +10596,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
- auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) {
+ auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
+ unsigned I, unsigned SliceSize) {
if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
return isa<UndefValue>(V) && !isa<PoisonValue>(V);
}))
@@ -10619,11 +10620,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(Mask.begin(), Mask.end(), 0);
+ std::iota(std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(), (I + 1) * SliceSize), 0);
} else {
- unsigned I =
+ unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(Mask.begin(), Mask.end(), I);
+ std::fill(std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
}
return true;
};
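The essential fix to FindReusedSplat above is that it now writes only its own [I*SliceSize, (I+1)*SliceSize) slice of the shuffle mask rather than the whole mask; a standalone sketch of the slice fill (illustrative):

#include <algorithm>
#include <vector>

constexpr int PoisonMaskElem = -1;

// Fill one slice of a shuffle mask with a splat index, leaving slices that
// belong to other tree entries untouched.
void fillSplatSlice(std::vector<int> &Mask, unsigned I, unsigned SliceSize,
                    int SplatIdx) {
  std::fill(Mask.begin() + I * SliceSize, Mask.begin() + (I + 1) * SliceSize,
            SplatIdx);
}

int main() {
  std::vector<int> Mask(8, PoisonMaskElem);
  fillSplatSlice(Mask, /*I=*/1, /*SliceSize=*/4, /*SplatIdx=*/0);
  // Mask is now {-1, -1, -1, -1, 0, 0, 0, 0}: only slice 1 was written.
}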
@@ -10872,7 +10875,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
} else if (Vec1) {
IsUsedInExpr &= FindReusedSplat(
ExtractMask,
- cast<FixedVectorType>(Vec1->getType())->getNumElements());
+ cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
+ ExtractMask.size());
ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
@@ -10898,7 +10902,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
IsUsedInExpr &=
- FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
+ FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
ShuffleBuilder.add(*TEs.front(), VecMask);
if (TEs.front()->VectorizedValue)
IsNonPoisoned &=
@@ -11139,6 +11143,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
+ if (const TreeEntry *TE = getTreeEntry(V))
+ V = TE->VectorizedValue;
setInsertPointAfterBundle(E);
V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
@@ -11903,8 +11909,10 @@ Value *BoUpSLP::vectorizeTree(
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
- Ex = Builder.CreateExtractElement(ES->getOperand(0),
- ES->getOperand(1));
+ Value *V = ES->getVectorOperand();
+ if (const TreeEntry *ETE = getTreeEntry(V))
+ V = ETE->VectorizedValue;
+ Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
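Both SLPVectorizer hunks above apply the same idiom: before emitting an extract, redirect a scalar's vector operand to its vectorized replacement if the tree already produced one. A minimal standalone model (illustrative types):

#include <unordered_map>

struct Value;
struct TreeEntry {
  Value *VectorizedValue = nullptr;
};

// Prefer the vectorized replacement of V when the tree has one.
Value *mapToVectorized(Value *V,
                       const std::unordered_map<Value *, TreeEntry *> &Tree) {
  auto It = Tree.find(V);
  if (It != Tree.end() && It->second->VectorizedValue)
    return It->second->VectorizedValue;
  return V;
}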
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a0908..4b3143aead46 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,8 +138,11 @@ public:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True or the loop mask when
- /// tail folding. It returns the *entry* mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
+ /// tail folding.
+ void createBlockInMask(BasicBlock *BB, VPlan &Plan);
+
+ /// Returns the *entry* mask for the block \p BB.
+ VPValue *getBlockInMask(BasicBlock *BB) const;
/// A helper function that computes the predicate of the edge between SRC
/// and DST.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d7df9c9575a..b6e56c47c227 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
// ExitBB can be re-used for the exit block of the Plan.
NewBB = State->CFG.ExitBB;
State->CFG.PrevBB = NewBB;
+ State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
// Update the branch instruction in the predecessor to branch to ExitBB.
VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 7d33baac52c9..4b4f4911eb64 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -842,6 +842,12 @@ public:
WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
};
+protected:
+ struct GEPFlagsTy {
+ char IsInBounds : 1;
+ GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
+ };
+
private:
struct DisjointFlagsTy {
char IsDisjoint : 1;
@@ -849,9 +855,6 @@ private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct GEPFlagsTy {
- char IsInBounds : 1;
- };
struct NonNegFlagsTy {
char NonNeg : 1;
};
@@ -933,12 +936,21 @@ public:
: VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp),
FMFs(FMFs) {}
+protected:
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ GEPFlagsTy GEPFlags, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), OpType(OperationType::GEPOp),
+ GEPFlags(GEPFlags) {}
+
+public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
- R->getVPDefID() == VPRecipeBase::VPReplicateSC;
+ R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
}
/// Drop all poison-generating flags.
@@ -1061,7 +1073,8 @@ public:
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
BranchOnCount,
- BranchOnCond
+ BranchOnCond,
+ ComputeReductionResult,
};
private:
@@ -1360,15 +1373,16 @@ public:
/// A recipe to compute the pointers for widened memory accesses of IndexTy for
/// all parts. If IsReverse is true, compute pointers for accessing the input in
/// reverse order per part.
-class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
+class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue {
Type *IndexedTy;
bool IsReverse;
public:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
- DebugLoc DL)
- : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
- IndexedTy(IndexedTy), IsReverse(IsReverse) {}
+ bool IsInBounds, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
+ GEPFlagsTy(IsInBounds), DL),
+ VPValue(this), IndexedTy(IndexedTy), IsReverse(IsReverse) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -3132,6 +3146,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
return Rep->isUniform();
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
+ if (auto *VPI = dyn_cast<VPInstruction>(Def))
+ return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
return false;
}
} // end namespace vputils
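GEPFlagsTy above stores its one flag in a single-bit bitfield so recipes can carry IR flags by value; a standalone rendering of the pattern (illustrative):

// One-bit flag struct, recorded at recipe construction time.
struct GEPFlags {
  char IsInBounds : 1;
  explicit GEPFlags(bool InBounds) : IsInBounds(InBounds) {}
};

static_assert(sizeof(GEPFlags) == 1, "flag stays byte-sized");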
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76961629aece..1f844bce2310 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
@@ -119,7 +120,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return false;
case VPInstructionSC:
switch (cast<VPInstruction>(this)->getOpcode()) {
+ case Instruction::Or:
case Instruction::ICmp:
+ case Instruction::Select:
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -401,6 +404,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
return CondBr;
}
+ case VPInstruction::ComputeReductionResult: {
+ if (Part != 0)
+ return State.get(this, 0);
+
+ // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
+ // and will be removed by breaking up the recipe further.
+ auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // Get its reduction variable descriptor.
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+
+ State.setDebugLocFrom(getDebugLoc());
+
+ VPValue *LoopExitingDef = getOperand(1);
+ Type *PhiTy = OrigPhi->getType();
+ VectorParts RdxParts(State.UF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = State.get(LoopExitingDef, Part);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ // TODO: Handle this in truncateToMinBW.
+ if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ }
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+ if (PhiR->isOrdered()) {
+ ReducedPartRdx = RdxParts[State.UF - 1];
+ } else {
+ // Floating-point operations should have some FMF to enable the reduction.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ for (unsigned Part = 1; Part < State.UF; ++Part) {
+ Value *RdxPart = RdxParts[Part];
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ ReducedPartRdx = Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ TrackingVH<Value> ReductionStartValue =
+ RdxDesc.getRecurrenceStartValue();
+ ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
+ } else
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+ }
+ }
+
+ // Create the reduction after the loop. Note that inloop reductions create
+ // the target reduction in the loop using a Reduction recipe.
+ if (State.VF.isVector() && !PhiR->isInLoop()) {
+ ReducedPartRdx =
+ createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (PhiTy != RdxDesc.getRecurrenceType())
+ ReducedPartRdx = RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+ : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+ }
+
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ auto *NewSI = Builder.CreateAlignedStore(
+ ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
+ propagateMetadata(NewSI, SI);
+ }
+
+ return ReducedPartRdx;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
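The part-folding loop above combines the UF unrolled vector parts lane-by-lane with the reduction opcode and only then performs the single horizontal (target) reduction. A standalone scalar model of those two steps (plain C++, add reduction, VF fixed at 4 for illustration):

#include <array>
#include <cassert>
#include <cstdio>
#include <numeric>
#include <vector>

using Vec = std::array<int, 4>; // one unrolled "part": a VF=4 vector

int computeReductionResult(const std::vector<Vec> &Parts) {
  assert(!Parts.empty() && "expected at least one part");
  Vec Acc = Parts[0];
  for (size_t Part = 1; Part < Parts.size(); ++Part)
    for (size_t Lane = 0; Lane < Acc.size(); ++Lane)
      Acc[Lane] += Parts[Part][Lane]; // bin.rdx, lane-wise across parts
  return std::accumulate(Acc.begin(), Acc.end(), 0); // final target reduction
}

int main() {
  std::vector<Vec> Parts = {{1, 2, 3, 4}, {5, 6, 7, 8}}; // UF = 2
  std::printf("%d\n", computeReductionResult(Parts));    // 36
}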
@@ -477,6 +558,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::ComputeReductionResult:
+ O << "compute-reduction-result";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1225,9 +1309,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
? DL.getIndexType(IndexedTy->getPointerTo())
: Builder.getInt32Ty();
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
- bool InBounds = false;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = GEP->isInBounds();
+ bool InBounds = isInBounds();
if (IsReverse) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
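With the flag recorded up front (see the tryToWidenMemory and VPlan.h hunks above), ::execute no longer re-derives inbounds from the IR pointer; a compact model of the record-at-construction pattern (illustrative):

struct GEP {
  bool InBounds;
};

// Derive the property once, when the recipe is built...
struct VectorPointerRecipeModel {
  bool IsInBounds;
  explicit VectorPointerRecipeModel(const GEP *G)
      : IsInBounds(G ? G->InBounds : false) {}
  // ...and just read the recorded bit at code generation time.
  bool isInBounds() const { return IsInBounds; }
};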
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 33132880d5a4..5c430620a2dc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -829,15 +829,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
Type *ATy = TypeInfo.inferScalarType(A);
if (TruncTy == ATy) {
Trunc->replaceAllUsesWith(A);
- } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
- auto *VPC =
- new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
- auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
+ } else {
+ // Don't replace a scalarizing recipe with a widened cast.
+ if (isa<VPReplicateRecipe>(&R))
+ break;
+ if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
}
#ifndef NDEBUG
        // Verify that the cached type info for both A and its users is still
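The restructured branch above applies three width-based rules to trunc(ext(A)), now skipped for scalarizing replicate recipes; a standalone model over scalar bit widths (illustrative):

// Simplify trunc(ext(A)) by comparing A's width with the trunc width.
enum class Cast { None, Ext, Trunc };

Cast simplifyTruncOfExt(unsigned WidthA, unsigned WidthTrunc) {
  if (WidthA == WidthTrunc)
    return Cast::None; // trunc(ext(A)) folds to A
  if (WidthA < WidthTrunc)
    return Cast::Ext; // replace with a direct (narrower) extend of A
  return Cast::Trunc; // replace with a direct truncate of A
}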