Diffstat (limited to 'lib/Transforms/InstCombine/InstCombineCalls.cpp')
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineCalls.cpp | 1148
1 file changed, 1001 insertions, 147 deletions
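Note: the SimplifyElementAtomicMemCpy routine introduced in the patch below unfolds a small element-wise atomic memcpy into explicit unordered atomic loads and stores, capped by the new -unfold-element-atomic-memcpy-max-elements option (default 16). As a minimal sketch, assuming a copy of 2 elements of 4 bytes each with 4-byte source and destination alignment, the unfolding is expected to produce IR along these lines (value names here are illustrative, not the ones the builder emits):

  %src.cast = bitcast i8* %src to i32*
  %dst.cast = bitcast i8* %dst to i32*
  ; element 0
  %src.0 = getelementptr i32, i32* %src.cast, i64 0
  %dst.0 = getelementptr i32, i32* %dst.cast, i64 0
  %val.0 = load atomic i32, i32* %src.0 unordered, align 4
  store atomic i32 %val.0, i32* %dst.0 unordered, align 4
  ; element 1
  %src.1 = getelementptr i32, i32* %src.cast, i64 1
  %dst.1 = getelementptr i32, i32* %dst.cast, i64 1
  %val.1 = load atomic i32, i32* %src.1 unordered, align 4
  store atomic i32 %val.1, i32* %dst.1 unordered, align 4

The original intrinsic then has its element count set to zero and is erased on a later InstCombine iteration, as the code below notes.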
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2ef82ba3ed8c4..69484f47223f7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -60,6 +60,12 @@ using namespace PatternMatch; STATISTIC(NumSimplified, "Number of library calls simplified"); +static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements( + "unfold-element-atomic-memcpy-max-elements", + cl::init(16), + cl::desc("Maximum number of elements in atomic memcpy the optimizer is " + "allowed to unfold")); + /// Return the specified type promoted as it would be to pass though a va_arg /// area. static Type *getPromotedType(Type *Ty) { @@ -70,27 +76,6 @@ static Type *getPromotedType(Type *Ty) { return Ty; } -/// Given an aggregate type which ultimately holds a single scalar element, -/// like {{{type}}} or [1 x type], return type. -static Type *reduceToSingleValueType(Type *T) { - while (!T->isSingleValueType()) { - if (StructType *STy = dyn_cast<StructType>(T)) { - if (STy->getNumElements() == 1) - T = STy->getElementType(0); - else - break; - } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) { - if (ATy->getNumElements() == 1) - T = ATy->getElementType(); - else - break; - } else - break; - } - - return T; -} - /// Return a constant boolean vector that has true elements in all positions /// where the input constant data vector has an element with the sign bit set. static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { @@ -108,6 +93,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { return ConstantVector::get(BoolVec); } +Instruction * +InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) { + // Try to unfold this intrinsic into sequence of explicit atomic loads and + // stores. + // First check that number of elements is compile time constant. + auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements()); + if (!NumElementsCI) + return nullptr; + + // Check that there are not too many elements. + uint64_t NumElements = NumElementsCI->getZExtValue(); + if (NumElements >= UnfoldElementAtomicMemcpyMaxElements) + return nullptr; + + // Don't unfold into illegal integers + uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8; + if (!getDataLayout().isLegalInteger(ElementSizeInBytes)) + return nullptr; + + // Cast source and destination to the correct type. Intrinsic input arguments + // are usually represented as i8*. + // Often operands will be explicitly casted to i8* and we can just strip + // those casts instead of inserting new ones. However it's easier to rely on + // other InstCombine rules which will cover trivial cases anyway. + Value *Src = AMI->getRawSource(); + Value *Dst = AMI->getRawDest(); + Type *ElementPointerType = Type::getIntNPtrTy( + AMI->getContext(), ElementSizeInBytes, Src->getType()->getPointerAddressSpace()); + + Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType, + "memcpy_unfold.src_casted"); + Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType, + "memcpy_unfold.dst_casted"); + + for (uint64_t i = 0; i < NumElements; ++i) { + // Get current element addresses + ConstantInt *ElementIdxCI = + ConstantInt::get(AMI->getContext(), APInt(64, i)); + Value *SrcElementAddr = + Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr"); + Value *DstElementAddr = + Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr"); + + // Load from the source. 
Transfer alignment information and mark load as + // unordered atomic. + LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val"); + Load->setOrdering(AtomicOrdering::Unordered); + // We know alignment of the first element. It is also guaranteed by the + // verifier that element size is less or equal than first element alignment + // and both of this values are powers of two. + // This means that all subsequent accesses are at least element size + // aligned. + // TODO: We can infer better alignment but there is no evidence that this + // will matter. + Load->setAlignment(i == 0 ? AMI->getSrcAlignment() + : AMI->getElementSizeInBytes()); + Load->setDebugLoc(AMI->getDebugLoc()); + + // Store loaded value via unordered atomic store. + StoreInst *Store = Builder->CreateStore(Load, DstElementAddr); + Store->setOrdering(AtomicOrdering::Unordered); + Store->setAlignment(i == 0 ? AMI->getDstAlignment() + : AMI->getElementSizeInBytes()); + Store->setDebugLoc(AMI->getDebugLoc()); + } + + // Set the number of elements of the copy to 0, it will be deleted on the + // next iteration. + AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType())); + return AMI; +} + Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT); unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT); @@ -144,41 +201,19 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); - // Memcpy forces the use of i8* for the source and destination. That means - // that if you're using memcpy to move one double around, you'll get a cast - // from double* to i8*. We'd much rather use a double load+store rather than - // an i64 load+store, here because this improves the odds that the source or - // dest address will be promotable. See if we can find a better type than the - // integer datatype. - Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); + // If the memcpy has metadata describing the members, see if we can get the + // TBAA tag describing our copy. MDNode *CopyMD = nullptr; - if (StrippedDest != MI->getArgOperand(0)) { - Type *SrcETy = cast<PointerType>(StrippedDest->getType()) - ->getElementType(); - if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) { - // The SrcETy might be something like {{{double}}} or [1 x double]. Rip - // down through these levels if so. - SrcETy = reduceToSingleValueType(SrcETy); - - if (SrcETy->isSingleValueType()) { - NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp); - NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp); - - // If the memcpy has metadata describing the members, see if we can - // get the TBAA tag describing our copy. 
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { - if (M->getNumOperands() == 3 && M->getOperand(0) && - mdconst::hasa<ConstantInt>(M->getOperand(0)) && - mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() && - M->getOperand(1) && - mdconst::hasa<ConstantInt>(M->getOperand(1)) && - mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == - Size && - M->getOperand(2) && isa<MDNode>(M->getOperand(2))) - CopyMD = cast<MDNode>(M->getOperand(2)); - } - } - } + if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { + if (M->getNumOperands() == 3 && M->getOperand(0) && + mdconst::hasa<ConstantInt>(M->getOperand(0)) && + mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() && + M->getOperand(1) && + mdconst::hasa<ConstantInt>(M->getOperand(1)) && + mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == + Size && + M->getOperand(2) && isa<MDNode>(M->getOperand(2))) + CopyMD = cast<MDNode>(M->getOperand(2)); } // If the memcpy/memmove provides better alignment info than we can @@ -510,6 +545,131 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, return Builder.CreateAShr(Vec, ShiftVec); } +static Value *simplifyX86muldq(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + assert(Arg0->getType()->getScalarSizeInBits() == 32 && + Arg1->getType()->getScalarSizeInBits() == 32 && + ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types"); + + // muldq/muludq(undef, undef) -> zero (matches generic mul behavior) + if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) + return ConstantAggregateZero::get(ResTy); + + // Constant folding. + // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)), + // vXi64 sext(shuffle<0,2,..>(Arg1)))) + // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)), + // vXi64 zext(shuffle<0,2,..>(Arg1)))) + if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) + return nullptr; + + unsigned NumElts = ResTy->getVectorNumElements(); + assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) && + Arg1->getType()->getVectorNumElements() == (2 * NumElts) && + "Unexpected muldq/muludq types"); + + unsigned IntrinsicID = II.getIntrinsicID(); + bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID || + Intrinsic::x86_avx2_pmul_dq == IntrinsicID || + Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID); + + SmallVector<unsigned, 16> ShuffleMask; + for (unsigned i = 0; i != NumElts; ++i) + ShuffleMask.push_back(i * 2); + + auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask); + auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask); + + if (IsSigned) { + LHS = Builder.CreateSExt(LHS, ResTy); + RHS = Builder.CreateSExt(RHS, ResTy); + } else { + LHS = Builder.CreateZExt(LHS, ResTy); + RHS = Builder.CreateZExt(RHS, ResTy); + } + + return Builder.CreateMul(LHS, RHS); +} + +static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. 
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) + return UndefValue::get(ResTy); + + Type *ArgTy = Arg0->getType(); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumDstElts = ResTy->getVectorNumElements(); + unsigned NumSrcElts = ArgTy->getVectorNumElements(); + assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types"); + + unsigned NumDstEltsPerLane = NumDstElts / NumLanes; + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + auto *Cst0 = dyn_cast<Constant>(Arg0); + auto *Cst1 = dyn_cast<Constant>(Arg1); + if (!Cst0 || !Cst1) + return nullptr; + + SmallVector<Constant *, 32> Vals; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { + unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; + auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0; + auto *COp = Cst->getAggregateElement(SrcIdx); + if (COp && isa<UndefValue>(COp)) { + Vals.push_back(UndefValue::get(ResTy->getScalarType())); + continue; + } + + auto *CInt = dyn_cast_or_null<ConstantInt>(COp); + if (!CInt) + return nullptr; + + APInt Val = CInt->getValue(); + assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() && + "Unexpected constant bitwidth"); + + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + if (Val.isSignedIntN(DstScalarSizeInBits)) + Val = Val.trunc(DstScalarSizeInBits); + else if (Val.isNegative()) + Val = APInt::getSignedMinValue(DstScalarSizeInBits); + else + Val = APInt::getSignedMaxValue(DstScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. + if (Val.isIntN(DstScalarSizeInBits)) + Val = Val.trunc(DstScalarSizeInBits); + else if (Val.isNegative()) + Val = APInt::getNullValue(DstScalarSizeInBits); + else + Val = APInt::getAllOnesValue(DstScalarSizeInBits); + } + + Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val)); + } + } + + return ConstantVector::get(Vals); +} + static Value *simplifyX86movmsk(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { Value *Arg = II.getArgOperand(0); @@ -1330,6 +1490,27 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { return true; } +// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. +// +// A single NaN input is folded to minnum, so we rely on that folding for +// handling NaNs. +static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, + const APFloat &Src2) { + APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); + + APFloat::cmpResult Cmp0 = Max3.compare(Src0); + assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp0 == APFloat::cmpEqual) + return maxnum(Src1, Src2); + + APFloat::cmpResult Cmp1 = Max3.compare(Src1); + assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp1 == APFloat::cmpEqual) + return maxnum(Src0, Src2); + + return maxnum(Src0, Src1); +} + // Returns true iff the 2 intrinsics have the same operands, limiting the // comparison to the first NumOperands. 
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, @@ -1373,6 +1554,254 @@ static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID, return false; } +// Convert NVVM intrinsics to target-generic LLVM code where possible. +static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { + // Each NVVM intrinsic we can simplify can be replaced with one of: + // + // * an LLVM intrinsic, + // * an LLVM cast operation, + // * an LLVM binary operation, or + // * ad-hoc LLVM IR for the particular operation. + + // Some transformations are only valid when the module's + // flush-denormals-to-zero (ftz) setting is true/false, whereas other + // transformations are valid regardless of the module's ftz setting. + enum FtzRequirementTy { + FTZ_Any, // Any ftz setting is ok. + FTZ_MustBeOn, // Transformation is valid only if ftz is on. + FTZ_MustBeOff, // Transformation is valid only if ftz is off. + }; + // Classes of NVVM intrinsics that can't be replaced one-to-one with a + // target-generic intrinsic, cast op, or binary op but that we can nonetheless + // simplify. + enum SpecialCase { + SPC_Reciprocal, + }; + + // SimplifyAction is a poor-man's variant (plus an additional flag) that + // represents how to replace an NVVM intrinsic with target-generic LLVM IR. + struct SimplifyAction { + // Invariant: At most one of these Optionals has a value. + Optional<Intrinsic::ID> IID; + Optional<Instruction::CastOps> CastOp; + Optional<Instruction::BinaryOps> BinaryOp; + Optional<SpecialCase> Special; + + FtzRequirementTy FtzRequirement = FTZ_Any; + + SimplifyAction() = default; + + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) + : IID(IID), FtzRequirement(FtzReq) {} + + // Cast operations don't have anything to do with FTZ, so we skip that + // argument. + SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} + + SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) + : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} + + SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) + : Special(Special), FtzRequirement(FtzReq) {} + }; + + // Try to generate a SimplifyAction describing how to replace our + // IntrinsicInstr with target-generic LLVM IR. + const SimplifyAction Action = [II]() -> SimplifyAction { + switch (II->getIntrinsicID()) { + + // NVVM intrinsics that map directly to LLVM intrinsics. 
+ case Intrinsic::nvvm_ceil_d: + return {Intrinsic::ceil, FTZ_Any}; + case Intrinsic::nvvm_ceil_f: + return {Intrinsic::ceil, FTZ_MustBeOff}; + case Intrinsic::nvvm_ceil_ftz_f: + return {Intrinsic::ceil, FTZ_MustBeOn}; + case Intrinsic::nvvm_fabs_d: + return {Intrinsic::fabs, FTZ_Any}; + case Intrinsic::nvvm_fabs_f: + return {Intrinsic::fabs, FTZ_MustBeOff}; + case Intrinsic::nvvm_fabs_ftz_f: + return {Intrinsic::fabs, FTZ_MustBeOn}; + case Intrinsic::nvvm_floor_d: + return {Intrinsic::floor, FTZ_Any}; + case Intrinsic::nvvm_floor_f: + return {Intrinsic::floor, FTZ_MustBeOff}; + case Intrinsic::nvvm_floor_ftz_f: + return {Intrinsic::floor, FTZ_MustBeOn}; + case Intrinsic::nvvm_fma_rn_d: + return {Intrinsic::fma, FTZ_Any}; + case Intrinsic::nvvm_fma_rn_f: + return {Intrinsic::fma, FTZ_MustBeOff}; + case Intrinsic::nvvm_fma_rn_ftz_f: + return {Intrinsic::fma, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_d: + return {Intrinsic::maxnum, FTZ_Any}; + case Intrinsic::nvvm_fmax_f: + return {Intrinsic::maxnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_f: + return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_d: + return {Intrinsic::minnum, FTZ_Any}; + case Intrinsic::nvvm_fmin_f: + return {Intrinsic::minnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_f: + return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_round_d: + return {Intrinsic::round, FTZ_Any}; + case Intrinsic::nvvm_round_f: + return {Intrinsic::round, FTZ_MustBeOff}; + case Intrinsic::nvvm_round_ftz_f: + return {Intrinsic::round, FTZ_MustBeOn}; + case Intrinsic::nvvm_sqrt_rn_d: + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_f: + // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the + // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts + // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are + // the versions with explicit ftz-ness. + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_rn_f: + return {Intrinsic::sqrt, FTZ_MustBeOff}; + case Intrinsic::nvvm_sqrt_rn_ftz_f: + return {Intrinsic::sqrt, FTZ_MustBeOn}; + case Intrinsic::nvvm_trunc_d: + return {Intrinsic::trunc, FTZ_Any}; + case Intrinsic::nvvm_trunc_f: + return {Intrinsic::trunc, FTZ_MustBeOff}; + case Intrinsic::nvvm_trunc_ftz_f: + return {Intrinsic::trunc, FTZ_MustBeOn}; + + // NVVM intrinsics that map to LLVM cast operations. + // + // Note that llvm's target-generic conversion operators correspond to the rz + // (round to zero) versions of the nvvm conversion intrinsics, even though + // most everything else here uses the rn (round to nearest even) nvvm ops. + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_f2ll_rz: + return {Instruction::FPToSI}; + case Intrinsic::nvvm_d2ui_rz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_d2ull_rz: + case Intrinsic::nvvm_f2ull_rz: + return {Instruction::FPToUI}; + case Intrinsic::nvvm_i2d_rz: + case Intrinsic::nvvm_i2f_rz: + case Intrinsic::nvvm_ll2d_rz: + case Intrinsic::nvvm_ll2f_rz: + return {Instruction::SIToFP}; + case Intrinsic::nvvm_ui2d_rz: + case Intrinsic::nvvm_ui2f_rz: + case Intrinsic::nvvm_ull2d_rz: + case Intrinsic::nvvm_ull2f_rz: + return {Instruction::UIToFP}; + + // NVVM intrinsics that map to LLVM binary ops. 
+ case Intrinsic::nvvm_add_rn_d: + return {Instruction::FAdd, FTZ_Any}; + case Intrinsic::nvvm_add_rn_f: + return {Instruction::FAdd, FTZ_MustBeOff}; + case Intrinsic::nvvm_add_rn_ftz_f: + return {Instruction::FAdd, FTZ_MustBeOn}; + case Intrinsic::nvvm_mul_rn_d: + return {Instruction::FMul, FTZ_Any}; + case Intrinsic::nvvm_mul_rn_f: + return {Instruction::FMul, FTZ_MustBeOff}; + case Intrinsic::nvvm_mul_rn_ftz_f: + return {Instruction::FMul, FTZ_MustBeOn}; + case Intrinsic::nvvm_div_rn_d: + return {Instruction::FDiv, FTZ_Any}; + case Intrinsic::nvvm_div_rn_f: + return {Instruction::FDiv, FTZ_MustBeOff}; + case Intrinsic::nvvm_div_rn_ftz_f: + return {Instruction::FDiv, FTZ_MustBeOn}; + + // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but + // need special handling. + // + // We seem to be mising intrinsics for rcp.approx.{ftz.}f32, which is just + // as well. + case Intrinsic::nvvm_rcp_rn_d: + return {SPC_Reciprocal, FTZ_Any}; + case Intrinsic::nvvm_rcp_rn_f: + return {SPC_Reciprocal, FTZ_MustBeOff}; + case Intrinsic::nvvm_rcp_rn_ftz_f: + return {SPC_Reciprocal, FTZ_MustBeOn}; + + // We do not currently simplify intrinsics that give an approximate answer. + // These include: + // + // - nvvm_cos_approx_{f,ftz_f} + // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_lg2_approx_{d,f,ftz_f} + // - nvvm_sin_approx_{f,ftz_f} + // - nvvm_sqrt_approx_{f,ftz_f} + // - nvvm_rsqrt_approx_{d,f,ftz_f} + // - nvvm_div_approx_{ftz_d,ftz_f,f} + // - nvvm_rcp_approx_ftz_d + // + // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" + // means that fastmath is enabled in the intrinsic. Unfortunately only + // binary operators (currently) have a fastmath bit in SelectionDAG, so this + // information gets lost and we can't select on it. + // + // TODO: div and rcp are lowered to a binary op, so these we could in theory + // lower them to "fast fdiv". + + default: + return {}; + } + }(); + + // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we + // can bail out now. (Notice that in the case that IID is not an NVVM + // intrinsic, we don't have to look up any module metadata, as + // FtzRequirementTy will be FTZ_Any.) + if (Action.FtzRequirement != FTZ_Any) { + bool FtzEnabled = + II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == + "true"; + + if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) + return nullptr; + } + + // Simplify to target-generic intrinsic. + if (Action.IID) { + SmallVector<Value *, 4> Args(II->arg_operands()); + // All the target-generic intrinsics currently of interest to us have one + // type argument, equal to that of the nvvm intrinsic's argument. + Type *Tys[] = {II->getArgOperand(0)->getType()}; + return CallInst::Create( + Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + } + + // Simplify to target-generic binary op. + if (Action.BinaryOp) + return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), + II->getArgOperand(1), II->getName()); + + // Simplify to target-generic cast op. + if (Action.CastOp) + return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), + II->getName()); + + // All that's left are the special cases. + if (!Action.Special) + return nullptr; + + switch (*Action.Special) { + case SPC_Reciprocal: + // Simplify reciprocal. 
+ return BinaryOperator::Create( + Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), + II->getArgOperand(0), II->getName()); + } + llvm_unreachable("All SpecialCase enumerators should be handled in switch."); +} + Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); return nullptr; @@ -1462,6 +1891,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) { + if (Constant *C = dyn_cast<Constant>(AMI->getNumElements())) + if (C->isNullValue()) + return eraseInstFromFunction(*AMI); + + if (Instruction *I = SimplifyElementAtomicMemCpy(AMI)) + return I; + } + + if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) + return I; + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) { APInt UndefElts(Width, 0); @@ -1581,8 +2022,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, V); break; } - case Intrinsic::fma: case Intrinsic::fmuladd: { + // Canonicalize fast fmuladd to the separate fmul + fadd. + if (II->hasUnsafeAlgebra()) { + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->setFastMathFlags(II->getFastMathFlags()); + Value *Mul = Builder->CreateFMul(II->getArgOperand(0), + II->getArgOperand(1)); + Value *Add = Builder->CreateFAdd(Mul, II->getArgOperand(2)); + Add->takeName(II); + return replaceInstUsesWith(*II, Add); + } + + LLVM_FALLTHROUGH; + } + case Intrinsic::fma: { Value *Src0 = II->getArgOperand(0); Value *Src1 = II->getArgOperand(1); @@ -1631,6 +2085,26 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return SelectInst::Create(Cond, Call0, Call1); } + LLVM_FALLTHROUGH; + } + case Intrinsic::ceil: + case Intrinsic::floor: + case Intrinsic::round: + case Intrinsic::nearbyint: + case Intrinsic::rint: + case Intrinsic::trunc: { + Value *ExtSrc; + if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) && + II->getArgOperand(0)->hasOneUse()) { + // fabs (fpext x) -> fpext (fabs x) + Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(), + { ExtSrc->getType() }); + CallInst *NewFabs = Builder->CreateCall(F, ExtSrc); + NewFabs->copyFastMathFlags(II); + NewFabs->takeName(II); + return new FPExtInst(NewFabs, II->getType()); + } + break; } case Intrinsic::cos: @@ -1863,6 +2337,37 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return II; break; } + case Intrinsic::x86_avx512_mask_cmp_pd_128: + case Intrinsic::x86_avx512_mask_cmp_pd_256: + case Intrinsic::x86_avx512_mask_cmp_pd_512: + case Intrinsic::x86_avx512_mask_cmp_ps_128: + case Intrinsic::x86_avx512_mask_cmp_ps_256: + case Intrinsic::x86_avx512_mask_cmp_ps_512: { + // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + bool Arg0IsZero = match(Arg0, m_Zero()); + if (Arg0IsZero) + std::swap(Arg0, Arg1); + Value *A, *B; + // This fold requires only the NINF(not +/- inf) since inf minus + // inf is nan. + // NSZ(No Signed Zeros) is not needed because zeros of any sign are + // equal for both compares. + // NNAN is not needed because nans compare the same for both compares. + // The compare intrinsic uses the above assumptions and therefore + // doesn't require additional flags. 
+ if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && + match(Arg1, m_Zero()) && + cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { + if (Arg0IsZero) + std::swap(A, B); + II->setArgOperand(0, A); + II->setArgOperand(1, B); + return II; + } + break; + } case Intrinsic::x86_avx512_mask_add_ps_512: case Intrinsic::x86_avx512_mask_div_ps_512: @@ -2130,6 +2635,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx2_pmulu_dq: case Intrinsic::x86_avx512_pmul_dq_512: case Intrinsic::x86_avx512_pmulu_dq_512: { + if (Value *V = simplifyX86muldq(*II, *Builder)) + return replaceInstUsesWith(*II, V); + unsigned VWidth = II->getType()->getVectorNumElements(); APInt UndefElts(VWidth, 0); APInt DemandedElts = APInt::getAllOnesValue(VWidth); @@ -2141,6 +2649,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(*II, *this, *Builder, true)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(*II, *this, *Builder, false)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_pclmulqdq: { + if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + unsigned VWidth = Arg0->getType()->getVectorNumElements(); + APInt DemandedElts(VWidth, 0); + + APInt UndefElts1(VWidth, 0); + DemandedElts = (Imm & 0x01) ? 2 : 1; + if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts, + UndefElts1)) { + II->setArgOperand(0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + DemandedElts = (Imm & 0x10) ? 2 : 1; + if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts, + UndefElts2)) { + II->setArgOperand(1, V); + MadeChange = true; + } + + // If both input elements are undef, the result is undef. + if (UndefElts1[(Imm & 0x01) ? 1 : 0] || + UndefElts2[(Imm & 0x10) ? 1 : 0]) + return replaceInstUsesWith(*II, + ConstantAggregateZero::get(II->getType())); + + if (MadeChange) + return II; + } + break; + } + case Intrinsic::x86_sse41_insertps: if (Value *V = simplifyX86insertps(*II, *Builder)) return replaceInstUsesWith(*II, V); @@ -2531,9 +3097,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } - case Intrinsic::amdgcn_rcp: { - if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) { + Value *Src = II->getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(CI, Src); + + if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { const APFloat &ArgVal = C->getValueAPF(); APFloat Val(ArgVal.getSemantics(), 1.0); APFloat::opStatus Status = Val.divide(ArgVal, @@ -2546,6 +3117,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::amdgcn_rsq: { + Value *Src = II->getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? 
+ if (isa<UndefValue>(Src)) + return replaceInstUsesWith(CI, Src); + break; + } case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: { Value *Src = II->getArgOperand(0); @@ -2650,6 +3229,274 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); } + case Intrinsic::amdgcn_cvt_pkrtz: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { + if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { + const fltSemantics &HalfSem + = II->getType()->getScalarType()->getFltSemantics(); + bool LosesInfo; + APFloat Val0 = C0->getValueAPF(); + APFloat Val1 = C1->getValueAPF(); + Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + + Constant *Folded = ConstantVector::get({ + ConstantFP::get(II->getContext(), Val0), + ConstantFP::get(II->getContext(), Val1) }); + return replaceInstUsesWith(*II, Folded); + } + } + + if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: { + // Decompose simple cases into standard shifts. + Value *Src = II->getArgOperand(0); + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(*II, Src); + + unsigned Width; + Type *Ty = II->getType(); + unsigned IntSize = Ty->getIntegerBitWidth(); + + ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); + if (CWidth) { + Width = CWidth->getZExtValue(); + if ((Width & (IntSize - 1)) == 0) + return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); + + if (Width >= IntSize) { + // Hardware ignores high bits, so remove those. + II->setArgOperand(2, ConstantInt::get(CWidth->getType(), + Width & (IntSize - 1))); + return II; + } + } + + unsigned Offset; + ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); + if (COffset) { + Offset = COffset->getZExtValue(); + if (Offset >= IntSize) { + II->setArgOperand(1, ConstantInt::get(COffset->getType(), + Offset & (IntSize - 1))); + return II; + } + } + + bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe; + + // TODO: Also emit sub if only width is constant. + if (!CWidth && COffset && Offset == 0) { + Constant *KSize = ConstantInt::get(COffset->getType(), IntSize); + Value *ShiftVal = Builder->CreateSub(KSize, II->getArgOperand(2)); + ShiftVal = Builder->CreateZExt(ShiftVal, II->getType()); + + Value *Shl = Builder->CreateShl(Src, ShiftVal); + Value *RightShift = Signed ? + Builder->CreateAShr(Shl, ShiftVal) : + Builder->CreateLShr(Shl, ShiftVal); + RightShift->takeName(II); + return replaceInstUsesWith(*II, RightShift); + } + + if (!CWidth || !COffset) + break; + + // TODO: This allows folding to undef when the hardware has specific + // behavior? + if (Offset + Width < IntSize) { + Value *Shl = Builder->CreateShl(Src, IntSize - Offset - Width); + Value *RightShift = Signed ? + Builder->CreateAShr(Shl, IntSize - Width) : + Builder->CreateLShr(Shl, IntSize - Width); + RightShift->takeName(II); + return replaceInstUsesWith(*II, RightShift); + } + + Value *RightShift = Signed ? 
+ Builder->CreateAShr(Src, Offset) : + Builder->CreateLShr(Src, Offset); + + RightShift->takeName(II); + return replaceInstUsesWith(*II, RightShift); + } + case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_compr: { + ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1)); + if (!En) // Illegal. + break; + + unsigned EnBits = En->getZExtValue(); + if (EnBits == 0xf) + break; // All inputs enabled. + + bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr; + bool Changed = false; + for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { + if ((!IsCompr && (EnBits & (1 << I)) == 0) || + (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { + Value *Src = II->getArgOperand(I + 2); + if (!isa<UndefValue>(Src)) { + II->setArgOperand(I + 2, UndefValue::get(Src->getType())); + Changed = true; + } + } + } + + if (Changed) + return II; + + break; + + } + case Intrinsic::amdgcn_fmed3: { + // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled + // for the shader. + + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + Value *Src2 = II->getArgOperand(2); + + bool Swap = false; + // Canonicalize constants to RHS operands. + // + // fmed3(c0, x, c1) -> fmed3(x, c0, c1) + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II->setArgOperand(0, Src0); + II->setArgOperand(1, Src1); + II->setArgOperand(2, Src2); + return II; + } + + if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { + CallInst *NewCall = Builder->CreateMinNum(Src0, Src1); + NewCall->copyFastMathFlags(II); + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { + if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { + if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { + APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), + C2->getValueAPF()); + return replaceInstUsesWith(*II, + ConstantFP::get(Builder->getContext(), Result)); + } + } + } + + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2)); + if (!CC) + break; + + // Guard against invalid arguments. + int64_t CCVal = CC->getZExtValue(); + bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp; + if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || + CCVal > CmpInst::LAST_ICMP_PREDICATE)) || + (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || + CCVal > CmpInst::LAST_FCMP_PREDICATE))) + break; + + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { + if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { + Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); + return replaceInstUsesWith(*II, + ConstantExpr::getSExt(CCmp, II->getType())); + } + + // Canonicalize constants to RHS. 
+ CmpInst::Predicate SwapPred + = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); + II->setArgOperand(0, Src1); + II->setArgOperand(1, Src0); + II->setArgOperand(2, ConstantInt::get(CC->getType(), + static_cast<int>(SwapPred))); + return II; + } + + if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) + break; + + // Canonicalize compare eq with true value to compare != 0 + // llvm.amdgcn.icmp(zext (i1 x), 1, eq) + // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) + // llvm.amdgcn.icmp(sext (i1 x), -1, eq) + // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) + Value *ExtSrc; + if (CCVal == CmpInst::ICMP_EQ && + ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || + (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && + ExtSrc->getType()->isIntegerTy(1)) { + II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); + II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); + return II; + } + + CmpInst::Predicate SrcPred; + Value *SrcLHS; + Value *SrcRHS; + + // Fold compare eq/ne with 0 from a compare result as the predicate to the + // intrinsic. The typical use is a wave vote function in the library, which + // will be fed from a user code condition compared with 0. Fold in the + // redundant compare. + + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) + // -> llvm.amdgcn.[if]cmp(a, b, pred) + // + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) + // -> llvm.amdgcn.[if]cmp(a, b, inv pred) + if (match(Src1, m_Zero()) && + match(Src0, + m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { + if (CCVal == CmpInst::ICMP_EQ) + SrcPred = CmpInst::getInversePredicate(SrcPred); + + Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? + Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; + + Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID, + SrcLHS->getType()); + Value *Args[] = { SrcLHS, SrcRHS, + ConstantInt::get(CC->getType(), SrcPred) }; + CallInst *NewCall = Builder->CreateCall(NewF, Args); + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -2790,7 +3637,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // isKnownNonNull -> nonnull attribute if (isKnownNonNullAt(DerivedPtr, II, &DT)) - II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); } // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) @@ -2799,11 +3646,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) break; } - } + case Intrinsic::experimental_guard: { + // Is this guard followed by another guard? + Instruction *NextInst = II->getNextNode(); + Value *NextCond = nullptr; + if (match(NextInst, + m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { + Value *CurrCond = II->getArgOperand(0); + + // Remove a guard that it is immediately preceded by an identical guard. + if (CurrCond == NextCond) + return eraseInstFromFunction(*NextInst); + + // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 
+ II->setArgOperand(0, Builder->CreateAnd(CurrCond, NextCond)); + return eraseInstFromFunction(*NextInst); + } + break; + } + } return visitCallSite(II); } +// Fence instruction simplification +Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { + // Remove identical consecutive fences. + if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode())) + if (FI.isIdenticalTo(NFI)) + return eraseInstFromFunction(FI); + return nullptr; +} + // InvokeInst simplification // Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { @@ -2950,7 +3824,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && - !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + !CS.paramHasAttr(ArgNo, Attribute::NonNull) && isKnownNonNullAt(V, CS.getInstruction(), &DT)) Indices.push_back(ArgNo + 1); ArgNo++; @@ -2959,7 +3833,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { assert(ArgNo == CS.arg_size() && "sanity check"); if (!Indices.empty()) { - AttributeSet AS = CS.getAttributes(); + AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); @@ -3081,7 +3955,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return false; Instruction *Caller = CS.getInstruction(); - const AttributeSet &CallerPAL = CS.getAttributes(); + const AttributeList &CallerPAL = CS.getAttributes(); // Okay, this is a cast from a function to a different type. Unless doing so // would cause a type conversion of one of our arguments, change this call to @@ -3108,7 +3982,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } if (!CallerPAL.isEmpty() && !Caller->use_empty()) { - AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); + AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) return false; // Attribute not compatible with transformed value. } @@ -3149,8 +4023,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) return false; // Cannot transform this parameter value. - if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). - overlaps(AttributeFuncs::typeIncompatible(ParamTy))) + if (AttrBuilder(CallerPAL.getParamAttributes(i)) + .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. if (CS.isInAllocaArgument(i)) @@ -3158,9 +4032,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If the parameter is passed as a byval argument, then we have to have a // sized type and the sized type has to have the same size as the old type. - if (ParamTy != ActTy && - CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1, - Attribute::ByVal)) { + if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); if (!ParamPTy || !ParamPTy->getElementType()->isSized()) return false; @@ -3205,7 +4077,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { break; // Check if it has an attribute that's incompatible with varargs. 
- AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1); + AttributeList PAttrs = CallerPAL.getSlotAttributes(i - 1); if (PAttrs.hasAttribute(Index, Attribute::StructRet)) return false; } @@ -3213,44 +4085,37 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Okay, we decided that this is a safe thing to do: go ahead and start // inserting cast instructions as necessary. - std::vector<Value*> Args; + SmallVector<Value *, 8> Args; + SmallVector<AttributeSet, 8> ArgAttrs; Args.reserve(NumActualArgs); - SmallVector<AttributeSet, 8> attrVec; - attrVec.reserve(NumCommonArgs); + ArgAttrs.reserve(NumActualArgs); // Get any return attributes. - AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); + AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); - // Add the new return attributes. - if (RAttrs.hasAttributes()) - attrVec.push_back(AttributeSet::get(Caller->getContext(), - AttributeSet::ReturnIndex, RAttrs)); - AI = CS.arg_begin(); for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { Type *ParamTy = FT->getParamType(i); - if ((*AI)->getType() == ParamTy) { - Args.push_back(*AI); - } else { - Args.push_back(Builder->CreateBitOrPointerCast(*AI, ParamTy)); - } + Value *NewArg = *AI; + if ((*AI)->getType() != ParamTy) + NewArg = Builder->CreateBitOrPointerCast(*AI, ParamTy); + Args.push_back(NewArg); // Add any parameter attributes. - AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1); - if (PAttrs.hasAttributes()) - attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1, - PAttrs)); + ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); } // If the function takes more arguments than the call was taking, add them // now. - for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) + for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { Args.push_back(Constant::getNullValue(FT->getParamType(i))); + ArgAttrs.push_back(AttributeSet()); + } // If we are removing arguments to the function, emit an obnoxious warning. if (FT->getNumParams() < NumActualArgs) { @@ -3259,54 +4124,56 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Add all of the arguments in their promoted form to the arg list. for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { Type *PTy = getPromotedType((*AI)->getType()); + Value *NewArg = *AI; if (PTy != (*AI)->getType()) { // Must promote to pass through va_arg area! Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, PTy, false); - Args.push_back(Builder->CreateCast(opcode, *AI, PTy)); - } else { - Args.push_back(*AI); + NewArg = Builder->CreateCast(opcode, *AI, PTy); } + Args.push_back(NewArg); // Add any parameter attributes. - AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1); - if (PAttrs.hasAttributes()) - attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1, - PAttrs)); + ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); } } } AttributeSet FnAttrs = CallerPAL.getFnAttributes(); - if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex)) - attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs)); if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. 
- const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), - attrVec); + assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && + "missing argument attributes"); + LLVMContext &Ctx = Callee->getContext(); + AttributeList NewCallerPAL = AttributeList::get( + Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); - Instruction *NC; + CallSite NewCS; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles); - NC->takeName(II); - cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); - cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); + NewCS = Builder->CreateInvoke(Callee, II->getNormalDest(), + II->getUnwindDest(), Args, OpBundles); } else { - CallInst *CI = cast<CallInst>(Caller); - NC = Builder->CreateCall(Callee, Args, OpBundles); - NC->takeName(CI); - cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind()); - cast<CallInst>(NC)->setCallingConv(CI->getCallingConv()); - cast<CallInst>(NC)->setAttributes(NewCallerPAL); + NewCS = Builder->CreateCall(Callee, Args, OpBundles); + cast<CallInst>(NewCS.getInstruction()) + ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind()); } + NewCS->takeName(Caller); + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(NewCallerPAL); + + // Preserve the weight metadata for the new call instruction. The metadata + // is used by SamplePGO to check callsite's hotness. + uint64_t W; + if (Caller->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); // Insert a cast of the return type as necessary. + Instruction *NC = NewCS.getInstruction(); Value *NV = NC; if (OldRetTy != NV->getType() && !Caller->use_empty()) { if (!NV->getType()->isVoidTy()) { @@ -3351,7 +4218,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, Value *Callee = CS.getCalledValue(); PointerType *PTy = cast<PointerType>(Callee->getType()); FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); - const AttributeSet &Attrs = CS.getAttributes(); + AttributeList Attrs = CS.getAttributes(); // If the call already has the 'nest' attribute somewhere then give up - // otherwise 'nest' would occur twice after splicing in the chain. @@ -3364,50 +4231,46 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType()); - const AttributeSet &NestAttrs = NestF->getAttributes(); + AttributeList NestAttrs = NestF->getAttributes(); if (!NestAttrs.isEmpty()) { - unsigned NestIdx = 1; + unsigned NestArgNo = 0; Type *NestTy = nullptr; AttributeSet NestAttr; // Look for a parameter marked with the 'nest' attribute. for (FunctionType::param_iterator I = NestFTy->param_begin(), - E = NestFTy->param_end(); I != E; ++NestIdx, ++I) - if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) { + E = NestFTy->param_end(); + I != E; ++NestArgNo, ++I) { + AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); + if (AS.hasAttribute(Attribute::Nest)) { // Record the parameter type and any other attributes. 
NestTy = *I; - NestAttr = NestAttrs.getParamAttributes(NestIdx); + NestAttr = AS; break; } + } if (NestTy) { Instruction *Caller = CS.getInstruction(); std::vector<Value*> NewArgs; + std::vector<AttributeSet> NewArgAttrs; NewArgs.reserve(CS.arg_size() + 1); - - SmallVector<AttributeSet, 8> NewAttrs; - NewAttrs.reserve(Attrs.getNumSlots() + 1); + NewArgAttrs.reserve(CS.arg_size()); // Insert the nest argument into the call argument list, which may // mean appending it. Likewise for attributes. - // Add any result attributes. - if (Attrs.hasAttributes(AttributeSet::ReturnIndex)) - NewAttrs.push_back(AttributeSet::get(Caller->getContext(), - Attrs.getRetAttributes())); - { - unsigned Idx = 1; + unsigned ArgNo = 0; CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); do { - if (Idx == NestIdx) { + if (ArgNo == NestArgNo) { // Add the chain argument and attributes. Value *NestVal = Tramp->getArgOperand(2); if (NestVal->getType() != NestTy) NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest"); NewArgs.push_back(NestVal); - NewAttrs.push_back(AttributeSet::get(Caller->getContext(), - NestAttr)); + NewArgAttrs.push_back(NestAttr); } if (I == E) @@ -3415,23 +4278,13 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // Add the original argument and attributes. NewArgs.push_back(*I); - AttributeSet Attr = Attrs.getParamAttributes(Idx); - if (Attr.hasAttributes(Idx)) { - AttrBuilder B(Attr, Idx); - NewAttrs.push_back(AttributeSet::get(Caller->getContext(), - Idx + (Idx >= NestIdx), B)); - } + NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); - ++Idx; + ++ArgNo; ++I; } while (true); } - // Add any function attributes. - if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) - NewAttrs.push_back(AttributeSet::get(FTy->getContext(), - Attrs.getFnAttributes())); - // The trampoline may have been bitcast to a bogus type (FTy). // Handle this by synthesizing a new function type, equal to FTy // with the chain parameter inserted. @@ -3442,12 +4295,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // Insert the chain's type into the list of parameter types, which may // mean appending it. { - unsigned Idx = 1; + unsigned ArgNo = 0; FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); do { - if (Idx == NestIdx) + if (ArgNo == NestArgNo) // Add the chain's type. NewTypes.push_back(NestTy); @@ -3457,7 +4310,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // Add the original type. NewTypes.push_back(*I); - ++Idx; + ++ArgNo; ++I; } while (true); } @@ -3470,8 +4323,9 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttributeSet &NewPAL = - AttributeSet::get(FTy->getContext(), NewAttrs); + AttributeList NewPAL = + AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), + Attrs.getRetAttributes(), NewArgAttrs); SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); |
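To close, a hedged before/after sketch (illustrative value names) of the llvm.experimental.guard canonicalization added by this patch, where two adjacent guards are merged and an exact duplicate is simply dropped:

  ; before
  call void (i1, ...) @llvm.experimental.guard(i1 %a) [ "deopt"() ]
  call void (i1, ...) @llvm.experimental.guard(i1 %b) [ "deopt"() ]

  ; after: guard(a); guard(b) -> guard(a & b)
  %cond = and i1 %a, %b
  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]

The merged guard keeps the first call's deoptimization state; when %a and %b are the same value, the second guard is erased without introducing the and.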