Diffstat (limited to 'lib/Transforms/InstCombine/InstCombineCalls.cpp')
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineCalls.cpp | 532
1 file changed, 370 insertions, 162 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index cbfbd8a53993..aeb25d530d71 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -136,6 +136,14 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { if (Size > 8 || (Size&(Size-1))) return nullptr; // If not 1/2/4/8 bytes, exit. + // If it is an atomic and alignment is less than the size then we will + // introduce the unaligned memory access which will be later transformed + // into libcall in CodeGen. This is not evident performance gain so disable + // it now. + if (isa<AtomicMemTransferInst>(MI)) + if (CopyDstAlign < Size || CopySrcAlign < Size) + return nullptr; + // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); @@ -174,6 +182,9 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); if (LoopMemParallelMD) L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); + MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group); + if (AccessGroupMD) + L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. @@ -182,6 +193,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); + if (AccessGroupMD) + S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); if (auto *MT = dyn_cast<MemTransferInst>(MI)) { // non-atomics can be volatile @@ -215,6 +228,18 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { Alignment = MI->getDestAlignment(); assert(Len && "0-sized memory setting should be removed already."); + // Alignment 0 is identity for alignment 1 for memset, but not store. + if (Alignment == 0) + Alignment = 1; + + // If it is an atomic and alignment is less than the size then we will + // introduce the unaligned memory access which will be later transformed + // into libcall in CodeGen. This is not evident performance gain so disable + // it now. + if (isa<AtomicMemSetInst>(MI)) + if (Alignment < Len) + return nullptr; + // memset(s,c,n) -> store s, c (for n=1,2,4,8) if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. @@ -224,9 +249,6 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); Dest = Builder.CreateBitCast(Dest, NewDstPtrTy); - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) Alignment = 1; - // Extract the fill value and store. uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest, @@ -648,7 +670,7 @@ static Value *simplifyX86round(IntrinsicInst &II, } Intrinsic::ID ID = (RoundControl == 2) ? 
Intrinsic::ceil : Intrinsic::floor; - Value *Res = Builder.CreateIntrinsic(ID, {Src}, &II); + Value *Res = Builder.CreateUnaryIntrinsic(ID, Src, &II); if (!IsScalar) { if (auto *C = dyn_cast<Constant>(Mask)) if (C->isAllOnesValue()) @@ -675,7 +697,8 @@ static Value *simplifyX86round(IntrinsicInst &II, return Builder.CreateInsertElement(Dst, Res, (uint64_t)0); } -static Value *simplifyX86movmsk(const IntrinsicInst &II) { +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { Value *Arg = II.getArgOperand(0); Type *ResTy = II.getType(); Type *ArgTy = Arg->getType(); @@ -688,29 +711,46 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II) { if (!ArgTy->isVectorTy()) return nullptr; - auto *C = dyn_cast<Constant>(Arg); - if (!C) - return nullptr; + if (auto *C = dyn_cast<Constant>(Arg)) { + // Extract signbits of the vector input and pack into integer result. + APInt Result(ResTy->getPrimitiveSizeInBits(), 0); + for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) { + auto *COp = C->getAggregateElement(I); + if (!COp) + return nullptr; + if (isa<UndefValue>(COp)) + continue; - // Extract signbits of the vector input and pack into integer result. - APInt Result(ResTy->getPrimitiveSizeInBits(), 0); - for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) { - auto *COp = C->getAggregateElement(I); - if (!COp) - return nullptr; - if (isa<UndefValue>(COp)) - continue; + auto *CInt = dyn_cast<ConstantInt>(COp); + auto *CFp = dyn_cast<ConstantFP>(COp); + if (!CInt && !CFp) + return nullptr; - auto *CInt = dyn_cast<ConstantInt>(COp); - auto *CFp = dyn_cast<ConstantFP>(COp); - if (!CInt && !CFp) - return nullptr; + if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative())) + Result.setBit(I); + } + return Constant::getIntegerValue(ResTy, Result); + } - if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative())) - Result.setBit(I); + // Look for a sign-extended boolean source vector as the argument to this + // movmsk. If the argument is bitcast, look through that, but make sure the + // source of that bitcast is still a vector with the same number of elements. + // TODO: We can also convert a bitcast with wider elements, but that requires + // duplicating the bool source sign bits to match the number of elements + // expected by the movmsk call. 
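The rewritten simplifyX86movmsk above folds a constant vector operand by packing each element's sign bit into the corresponding bit of the integer result, and recognizes movmsk(sext <N x i1> X) as those same packed booleans. Below is a minimal standalone C++ sketch of that sign-bit packing, using plain arrays and std::memcpy instead of LLVM's Constant/APInt machinery; movmskPs is an illustrative name, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Pack the sign bit of each 32-bit element into bit I of the result, which is
// what movmskps does and what the constant-folding loop in the patch computes.
static uint32_t movmskPs(const float *Elts, unsigned NumElts) {
  uint32_t Result = 0;
  for (unsigned I = 0; I != NumElts; ++I) {
    uint32_t Bits;
    std::memcpy(&Bits, &Elts[I], sizeof(Bits)); // reinterpret the float's bits
    if (Bits >> 31)                             // sign bit set (negative or -0.0)
      Result |= (1u << I);
  }
  return Result;
}

int main() {
  float V[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
  std::printf("0x%x\n", movmskPs(V, 4)); // sign bits of elements 0 and 2 -> 0x5
  return 0;
}

Because each element of a sign-extended <N x i1> vector is either all ones or all zeros, the packed sign bits are exactly the original booleans, which is why the new code can emit zext(bitcast X to iN) instead of keeping the intrinsic.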
+ Arg = peekThroughBitcast(Arg); + Value *X; + if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == ArgTy->getVectorNumElements() && + match(Arg, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + // call iM movmsk(sext <N x i1> X) --> zext (bitcast <N x i1> X to iN) to iM + unsigned NumElts = X->getType()->getVectorNumElements(); + Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts); + Value *BC = Builder.CreateBitCast(X, ScalarTy); + return Builder.CreateZExtOrTrunc(BC, ResTy); } - return Constant::getIntegerValue(ResTy, Result); + return nullptr; } static Value *simplifyX86insertps(const IntrinsicInst &II, @@ -1133,82 +1173,6 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II, return nullptr; } -static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - - // fmin(x, x) -> x - if (Arg0 == Arg1) - return Arg0; - - const auto *C1 = dyn_cast<ConstantFP>(Arg1); - - // fmin(x, nan) -> x - if (C1 && C1->isNaN()) - return Arg0; - - // This is the value because if undef were NaN, we would return the other - // value and cannot return a NaN unless both operands are. - // - // fmin(undef, x) -> x - if (isa<UndefValue>(Arg0)) - return Arg1; - - // fmin(x, undef) -> x - if (isa<UndefValue>(Arg1)) - return Arg0; - - Value *X = nullptr; - Value *Y = nullptr; - if (II.getIntrinsicID() == Intrinsic::minnum) { - // fmin(x, fmin(x, y)) -> fmin(x, y) - // fmin(y, fmin(x, y)) -> fmin(x, y) - if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) { - if (Arg0 == X || Arg0 == Y) - return Arg1; - } - - // fmin(fmin(x, y), x) -> fmin(x, y) - // fmin(fmin(x, y), y) -> fmin(x, y) - if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) { - if (Arg1 == X || Arg1 == Y) - return Arg0; - } - - // TODO: fmin(nnan x, inf) -> x - // TODO: fmin(nnan ninf x, flt_max) -> x - if (C1 && C1->isInfinity()) { - // fmin(x, -inf) -> -inf - if (C1->isNegative()) - return Arg1; - } - } else { - assert(II.getIntrinsicID() == Intrinsic::maxnum); - // fmax(x, fmax(x, y)) -> fmax(x, y) - // fmax(y, fmax(x, y)) -> fmax(x, y) - if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) { - if (Arg0 == X || Arg0 == Y) - return Arg1; - } - - // fmax(fmax(x, y), x) -> fmax(x, y) - // fmax(fmax(x, y), y) -> fmax(x, y) - if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) { - if (Arg1 == X || Arg1 == Y) - return Arg0; - } - - // TODO: fmax(nnan x, -inf) -> x - // TODO: fmax(nnan ninf x, -flt_max) -> x - if (C1 && C1->isInfinity()) { - // fmax(x, inf) -> inf - if (!C1->isNegative()) - return Arg1; - } - } - return nullptr; -} - static bool maskIsAllOneOrUndef(Value *Mask) { auto *ConstMask = dyn_cast<Constant>(Mask); if (!ConstMask) @@ -1852,6 +1816,17 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { return nullptr; } +static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { + assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); + Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); + if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { + Call.setArgOperand(0, Arg1); + Call.setArgOperand(1, Arg0); + return &Call; + } + return nullptr; +} + /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallSite to do the heavy /// lifting. 
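The simplifyMinnumMaxnum helper deleted above encoded folds such as fmin(x, x) -> x and fmin(x, nan) -> x; its removed FIXME suggests those simplifications belong in InstSimplify rather than here. The folds rest on the IEEE-754 minNum/maxNum rule that a quiet NaN operand is ignored when the other operand is a number, which the C library's fmin/fmax also follow. A quick check of that rule (illustrative only, assuming the usual libm behaviour):

#include <cmath>
#include <cstdio>

int main() {
  double X = 2.5;
  double QNaN = std::nan("");
  std::printf("%f\n", std::fmin(X, QNaN)); // quiet NaN operand ignored: 2.500000
  std::printf("%f\n", std::fmax(QNaN, X)); // likewise for fmax: 2.500000
  std::printf("%f\n", std::fmin(X, X));    // fmin(x, x) -> x: 2.500000
  return 0;
}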
@@ -2005,18 +1980,49 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return I; break; + case Intrinsic::fshl: + case Intrinsic::fshr: { + const APInt *SA; + if (match(II->getArgOperand(2), m_APInt(SA))) { + Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); + unsigned BitWidth = SA->getBitWidth(); + uint64_t ShiftAmt = SA->urem(BitWidth); + assert(ShiftAmt != 0 && "SimplifyCall should have handled zero shift"); + // Normalize to funnel shift left. + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + // fshl(X, 0, C) -> shl X, C + // fshl(X, undef, C) -> shl X, C + if (match(Op1, m_Zero()) || match(Op1, m_Undef())) + return BinaryOperator::CreateShl( + Op0, ConstantInt::get(II->getType(), ShiftAmt)); + + // fshl(0, X, C) -> lshr X, (BW-C) + // fshl(undef, X, C) -> lshr X, (BW-C) + if (match(Op0, m_Zero()) || match(Op0, m_Undef())) + return BinaryOperator::CreateLShr( + Op1, ConstantInt::get(II->getType(), BitWidth - ShiftAmt)); + } + + // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, + // so only the low bits of the shift amount are demanded if the bitwidth is + // a power-of-2. + unsigned BitWidth = II->getType()->getScalarSizeInBits(); + if (!isPowerOf2_32(BitWidth)) + break; + APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); + KnownBits Op2Known(BitWidth); + if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) + return &CI; + break; + } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: - if (isa<Constant>(II->getArgOperand(0)) && - !isa<Constant>(II->getArgOperand(1))) { - // Canonicalize constants into the RHS. - Value *LHS = II->getArgOperand(0); - II->setArgOperand(0, II->getArgOperand(1)); - II->setArgOperand(1, LHS); - return II; - } + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; LLVM_FALLTHROUGH; case Intrinsic::usub_with_overflow: @@ -2034,34 +2040,164 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } - case Intrinsic::minnum: - case Intrinsic::maxnum: { + case Intrinsic::uadd_sat: + case Intrinsic::sadd_sat: + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + LLVM_FALLTHROUGH; + case Intrinsic::usub_sat: + case Intrinsic::ssub_sat: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); - // Canonicalize constants to the RHS. - if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) { - II->setArgOperand(0, Arg1); - II->setArgOperand(1, Arg0); - return II; + Intrinsic::ID IID = II->getIntrinsicID(); + + // Make use of known overflow information. 
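The funnel-shift hunk above folds llvm.fshl/llvm.fshr with a constant shift amount: fshr is first normalized to a left shift of BitWidth - ShiftAmt, and if one data operand is zero or undef the call reduces to a single shl or lshr. A standalone C++ model of the identity being used, assuming a 32-bit width and a shift amount already reduced modulo 32 and nonzero (fshl32 is an illustrative helper, not LLVM API):

#include <cstdint>
#include <cstdio>

// Funnel shift left on 32-bit values: shift the concatenation X:Y left by C
// and keep the high word, matching llvm.fshl.i32 for 0 < C < 32.
static uint32_t fshl32(uint32_t X, uint32_t Y, unsigned C) {
  return (X << C) | (Y >> (32 - C));
}

int main() {
  uint32_t X = 0x12345678;
  unsigned C = 8;
  // fshl(X, 0, C) -> shl X, C: the second operand contributes nothing.
  std::printf("0x%08x 0x%08x\n", fshl32(X, 0, C), X << C);
  // fshl(0, X, C) -> lshr X, (BitWidth - C): only the top bits of X survive.
  std::printf("0x%08x 0x%08x\n", fshl32(0, X, C), X >> (32 - C));
  return 0;
}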
+ OverflowResult OR; + switch (IID) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::uadd_sat: + OR = computeOverflowForUnsignedAdd(Arg0, Arg1, II); + if (OR == OverflowResult::NeverOverflows) + return BinaryOperator::CreateNUWAdd(Arg0, Arg1); + if (OR == OverflowResult::AlwaysOverflows) + return replaceInstUsesWith(*II, + ConstantInt::getAllOnesValue(II->getType())); + break; + case Intrinsic::usub_sat: + OR = computeOverflowForUnsignedSub(Arg0, Arg1, II); + if (OR == OverflowResult::NeverOverflows) + return BinaryOperator::CreateNUWSub(Arg0, Arg1); + if (OR == OverflowResult::AlwaysOverflows) + return replaceInstUsesWith(*II, + ConstantInt::getNullValue(II->getType())); + break; + case Intrinsic::sadd_sat: + if (willNotOverflowSignedAdd(Arg0, Arg1, *II)) + return BinaryOperator::CreateNSWAdd(Arg0, Arg1); + break; + case Intrinsic::ssub_sat: + if (willNotOverflowSignedSub(Arg0, Arg1, *II)) + return BinaryOperator::CreateNSWSub(Arg0, Arg1); + break; } - // FIXME: Simplifications should be in instsimplify. - if (Value *V = simplifyMinnumMaxnum(*II)) - return replaceInstUsesWith(*II, V); + // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN + Constant *C; + if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && + C->isNotMinSignedValue()) { + Value *NegVal = ConstantExpr::getNeg(C); + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + Intrinsic::sadd_sat, Arg0, NegVal)); + } + + // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) + // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) + // if Val and Val2 have the same sign + if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { + Value *X; + const APInt *Val, *Val2; + APInt NewVal; + bool IsUnsigned = + IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; + if (Other->getIntrinsicID() == II->getIntrinsicID() && + match(Arg1, m_APInt(Val)) && + match(Other->getArgOperand(0), m_Value(X)) && + match(Other->getArgOperand(1), m_APInt(Val2))) { + if (IsUnsigned) + NewVal = Val->uadd_sat(*Val2); + else if (Val->isNonNegative() == Val2->isNonNegative()) { + bool Overflow; + NewVal = Val->sadd_ov(*Val2, Overflow); + if (Overflow) { + // Both adds together may add more than SignedMaxValue + // without saturating the final result. + break; + } + } else { + // Cannot fold saturated addition with different signs. + break; + } + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + IID, X, ConstantInt::get(II->getType(), NewVal))); + } + } + break; + } + + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::minimum: + case Intrinsic::maximum: { + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + Intrinsic::ID IID = II->getIntrinsicID(); Value *X, *Y; if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && (Arg0->hasOneUse() || Arg1->hasOneUse())) { // If both operands are negated, invert the call and negate the result: - // minnum(-X, -Y) --> -(maxnum(X, Y)) - // maxnum(-X, -Y) --> -(minnum(X, Y)) - Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ? 
- Intrinsic::minnum : Intrinsic::maxnum; - Value *NewCall = Builder.CreateIntrinsic(NewIID, { X, Y }, II); + // min(-X, -Y) --> -(max(X, Y)) + // max(-X, -Y) --> -(min(X, Y)) + Intrinsic::ID NewIID; + switch (IID) { + case Intrinsic::maxnum: + NewIID = Intrinsic::minnum; + break; + case Intrinsic::minnum: + NewIID = Intrinsic::maxnum; + break; + case Intrinsic::maximum: + NewIID = Intrinsic::minimum; + break; + case Intrinsic::minimum: + NewIID = Intrinsic::maximum; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } + Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall); FNeg->copyIRFlags(II); return FNeg; } + + // m(m(X, C2), C1) -> m(X, C) + const APFloat *C1, *C2; + if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { + if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && + ((match(M->getArgOperand(0), m_Value(X)) && + match(M->getArgOperand(1), m_APFloat(C2))) || + (match(M->getArgOperand(1), m_Value(X)) && + match(M->getArgOperand(0), m_APFloat(C2))))) { + APFloat Res(0.0); + switch (IID) { + case Intrinsic::maxnum: + Res = maxnum(*C1, *C2); + break; + case Intrinsic::minnum: + Res = minnum(*C1, *C2); + break; + case Intrinsic::maximum: + Res = maximum(*C1, *C2); + break; + case Intrinsic::minimum: + Res = minimum(*C1, *C2); + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } + Instruction *NewCall = Builder.CreateBinaryIntrinsic( + IID, X, ConstantFP::get(Arg0->getType(), Res)); + NewCall->copyIRFlags(II); + return replaceInstUsesWith(*II, NewCall); + } + } + break; } case Intrinsic::fmuladd: { @@ -2079,17 +2215,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { LLVM_FALLTHROUGH; } case Intrinsic::fma: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - // Canonicalize constant multiply operand to Src1. 
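The saturating-intrinsic hunk above strength-reduces uadd.sat/usub.sat to plain nuw add/sub when overflow analysis proves overflow impossible, and merges nested saturating adds whose right-hand operands are constants (the Val->uadd_sat(*Val2) case). A small C++ sketch of the unsigned 8-bit case that exhaustively checks the nested-fold identity; uaddSat8 is an illustrative stand-in for the intrinsic:

#include <cstdint>
#include <cstdio>

// Unsigned saturating add on 8-bit values, i.e. llvm.uadd.sat.i8 semantics.
static uint8_t uaddSat8(uint8_t A, uint8_t B) {
  unsigned Sum = unsigned(A) + unsigned(B);
  return Sum > 0xFF ? 0xFF : uint8_t(Sum);
}

int main() {
  // When overflow is provably impossible the saturating add is a plain add,
  // which is what the NeverOverflows -> CreateNUWAdd case exploits.
  std::printf("%u %u\n", unsigned(uaddSat8(100, 27)), 100u + 27u); // 127 127

  // Nested saturating adds with constant operands merge:
  //   sat(sat(x + c2) + c1) == sat(x + (c1 +sat c2))
  bool AllEqual = true;
  for (unsigned X = 0; X <= 0xFF; ++X) {
    uint8_t Nested = uaddSat8(uaddSat8(uint8_t(X), 200), 100);
    uint8_t Merged = uaddSat8(uint8_t(X), uaddSat8(200, 100));
    AllEqual &= (Nested == Merged);
  }
  std::printf("nested and merged folds agree: %d\n", AllEqual); // 1
  return 0;
}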
- if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { - II->setArgOperand(0, Src1); - II->setArgOperand(1, Src0); - std::swap(Src0, Src1); - } + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; // fma fneg(x), fneg(y), z -> fma x, y, z + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); Value *X, *Y; if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { II->setArgOperand(0, X); @@ -2135,24 +2266,33 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *ExtSrc; if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) - Value *NarrowII = Builder.CreateIntrinsic(II->getIntrinsicID(), - { ExtSrc }, II); + Value *NarrowII = + Builder.CreateUnaryIntrinsic(II->getIntrinsicID(), ExtSrc, II); return new FPExtInst(NarrowII, II->getType()); } break; } case Intrinsic::cos: case Intrinsic::amdgcn_cos: { - Value *SrcSrc; + Value *X; Value *Src = II->getArgOperand(0); - if (match(Src, m_FNeg(m_Value(SrcSrc))) || - match(Src, m_FAbs(m_Value(SrcSrc)))) { + if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { // cos(-x) -> cos(x) // cos(fabs(x)) -> cos(x) - II->setArgOperand(0, SrcSrc); + II->setArgOperand(0, X); return II; } - + break; + } + case Intrinsic::sin: { + Value *X; + if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { + // sin(-x) --> -sin(x) + Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); + Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin); + FNeg->copyFastMathFlags(II); + return FNeg; + } break; } case Intrinsic::ppc_altivec_lvx: @@ -2382,7 +2522,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_avx_movmsk_ps_256: case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(*II)) + if (Value *V = simplifyX86movmsk(*II, Builder)) return replaceInstUsesWith(*II, V); break; @@ -2922,16 +3062,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx_blendv_ps_256: case Intrinsic::x86_avx_blendv_pd_256: case Intrinsic::x86_avx2_pblendvb: { - // Convert blendv* to vector selects if the mask is constant. - // This optimization is convoluted because the intrinsic is defined as - // getting a vector of floats or doubles for the ps and pd versions. - // FIXME: That should be changed. - + // fold (blend A, A, Mask) -> A Value *Op0 = II->getArgOperand(0); Value *Op1 = II->getArgOperand(1); Value *Mask = II->getArgOperand(2); - - // fold (blend A, A, Mask) -> A if (Op0 == Op1) return replaceInstUsesWith(CI, Op0); @@ -2944,6 +3078,33 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. 
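The trigonometric hunks above rely on the symmetry of the functions: cos(-x) and cos(fabs(x)) fold to cos(x), while sin(-x) becomes fneg(sin(x)) with the fast-math flags copied over. The identities themselves, checked with the C library (illustrative, assuming the usual libm behaviour):

#include <cmath>
#include <cstdio>

int main() {
  double X = 1.25;
  // cos is even: cos(-x) == cos(fabs(x)) == cos(x).
  std::printf("%f %f %f\n", std::cos(-X), std::cos(std::fabs(X)), std::cos(X));
  // sin is odd: sin(-x) == -sin(x), hence the fneg(sin(x)) rewrite.
  std::printf("%f %f\n", std::sin(-X), -std::sin(X));
  return 0;
}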
+ Value *BoolVec; + Mask = peekThroughBitcast(Mask); + if (match(Mask, m_SExt(m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II->getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); + unsigned NumOperandElts = II->getType()->getVectorNumElements(); + if (NumMaskElts == NumOperandElts) + return SelectInst::Create(BoolVec, Op1, Op0); + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II->getType()); + } + } + break; } @@ -3275,6 +3436,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, FCmp); } + if (Mask == (N_ZERO | P_ZERO)) { + // Equivalent of == 0. + Value *FCmp = Builder.CreateFCmpOEQ( + Src0, ConstantFP::get(Src0->getType(), 0.0)); + + FCmp->takeName(II); + return replaceInstUsesWith(*II, FCmp); + } + + // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other + if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) { + II->setArgOperand(1, ConstantInt::get(Src1->getType(), + Mask & ~(S_NAN | Q_NAN))); + return II; + } + const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); if (!CVal) { if (isa<UndefValue>(Src0)) @@ -3384,22 +3561,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe; - // TODO: Also emit sub if only width is constant. - if (!CWidth && COffset && Offset == 0) { - Constant *KSize = ConstantInt::get(COffset->getType(), IntSize); - Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2)); - ShiftVal = Builder.CreateZExt(ShiftVal, II->getType()); - - Value *Shl = Builder.CreateShl(Src, ShiftVal); - Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal) - : Builder.CreateLShr(Shl, ShiftVal); - RightShift->takeName(II); - return replaceInstUsesWith(*II, RightShift); - } - if (!CWidth || !COffset) break; + // The case of Width == 0 is handled above, which makes this tranformation + // safe. If Width == 0, then the ashr and lshr instructions become poison + // value since the shift amount would be equal to the bit size. + assert(Width != 0); + // TODO: This allows folding to undef when the hardware has specific // behavior? if (Offset + Width < IntSize) { @@ -3603,6 +3772,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; + Type *Ty = SrcLHS->getType(); + if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { + // Promote to next legal integer type. + unsigned Width = CmpType->getBitWidth(); + unsigned NewWidth = Width; + + // Don't do anything for i1 comparisons. + if (Width == 1) + break; + + if (Width <= 16) + NewWidth = 16; + else if (Width <= 32) + NewWidth = 32; + else if (Width <= 64) + NewWidth = 64; + else if (Width > 64) + break; // Can't handle this. 
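The blendv hunk above converts the x86 blendv intrinsics into IR selects when the mask is a constant or a sign-extended boolean vector; that is sound because blendv picks each element from the second operand exactly when the sign bit of the corresponding mask element is set. A scalar C++ model of the per-byte case (blendvb here is an illustrative helper mirroring the pblendvb behaviour the fold relies on, not the actual transform):

#include <cstdint>
#include <cstdio>

// Per-byte blend: pick Op1[I] when the sign bit of Mask[I] is set, else Op0[I].
// This is the behaviour the fold models with a vector select.
static void blendvb(const int8_t *Op0, const int8_t *Op1, const int8_t *Mask,
                    int8_t *Out, unsigned N) {
  for (unsigned I = 0; I != N; ++I)
    Out[I] = (Mask[I] < 0) ? Op1[I] : Op0[I];
}

int main() {
  int8_t A[4] = {1, 2, 3, 4};
  int8_t B[4] = {10, 20, 30, 40};
  // A sign-extended <4 x i1> mask is all-ones (-1) or all-zeros per element,
  // so testing the sign bit is the same as testing the original boolean.
  int8_t Mask[4] = {-1, 0, -1, 0};
  int8_t Out[4];
  blendvb(A, B, Mask, Out, 4);
  std::printf("%d %d %d %d\n", Out[0], Out[1], Out[2], Out[3]); // 10 2 30 4
  return 0;
}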
+ + if (Width != NewWidth) { + IntegerType *CmpTy = Builder.getIntNTy(NewWidth); + if (CmpInst::isSigned(SrcPred)) { + SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); + SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); + } else { + SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); + SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); + } + } + } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) + break; + Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID, SrcLHS->getType()); Value *Args[] = { SrcLHS, SrcRHS, @@ -3661,7 +3862,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. BasicBlock::iterator BI(II); - TerminatorInst *TI = II->getParent()->getTerminator(); + Instruction *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { if (isa<AllocaInst>(BI)) { @@ -3788,8 +3989,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); // isKnownNonNull -> nonnull attribute - if (isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) + if (!II->hasRetAttr(Attribute::NonNull) && + isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + return II; + } } // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) @@ -3889,7 +4093,11 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; - LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW); + auto InstCombineErase = [this](Instruction *I) { + eraseInstFromFunction(*I); + }; + LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW, + InstCombineErase); if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); |
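The amdgcn.icmp/fcmp hunk near the end widens an illegal integer comparison to the next legal width, sign-extending for signed predicates and zero-extending for unsigned ones, since those are the extensions that preserve the respective orderings. A short C++ illustration of why the choice matters:

#include <cstdint>
#include <cstdio>

int main() {
  int8_t A = -1, B = 1;

  // Signed predicate: sign-extension preserves the ordering (-1 < 1 stays true).
  int32_t SA = A, SB = B; // sext
  std::printf("%d %d\n", A < B, SA < SB); // 1 1

  // Zero-extending the same values would flip the result: 0xFF is not < 1.
  uint32_t ZA = uint8_t(A), ZB = uint8_t(B); // zext
  std::printf("%d\n", ZA < ZB); // 0

  // Unsigned predicate: zero-extension is the ordering-preserving choice.
  uint8_t UA = 0xFF, UB = 1;
  std::printf("%d %d\n", UA < UB, uint32_t(UA) < uint32_t(UB)); // 0 0
  return 0;
}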