Diffstat (limited to 'include/llvm/CodeGen/BasicTTIImpl.h')
-rw-r--r-- | include/llvm/CodeGen/BasicTTIImpl.h | 215
1 file changed, 167 insertions(+), 48 deletions(-)
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index d99054eb6f368..69951afb623c3 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -105,6 +105,11 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
 
+  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace,
+                                      unsigned Alignment, bool *Fast) const {
+    MVT M = MVT::getIntegerVT(BitWidth);
+    return getTLI()->allowsMisalignedMemoryAccesses(M, AddressSpace, Alignment, Fast);
+  }
 
   bool hasBranchDivergence() { return false; }
 
@@ -152,6 +157,11 @@ public:
     return getTLI()->isTypeLegal(VT);
   }
 
+  int getGEPCost(Type *PointeeType, const Value *Ptr,
+                 ArrayRef<const Value *> Operands) {
+    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
+  }
+
   unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<const Value *> Arguments) {
     return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
@@ -216,6 +226,8 @@ public:
     return BaseT::getOperationCost(Opcode, Ty, OpTy);
   }
 
+  unsigned getInliningThresholdMultiplier() { return 1; }
+
   void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
     // This unrolling functionality is target independent, but to provide some
     // motivation for its intended use, for x86:
@@ -307,12 +319,14 @@ public:
     }
 
     if (!TLI->isOperationExpand(ISD, LT.second)) {
-      // If the operation is custom lowered then assume
-      // thare the code is twice as expensive.
+      // If the operation is custom lowered, then assume that the code is twice
+      // as expensive.
       return LT.first * 2 * OpCost;
     }
 
     // Else, assume that we need to scalarize this op.
+    // TODO: If one of the types get legalized by splitting, handle this
+    // similarly to what getCastInstrCost() does.
     if (Ty->isVectorTy()) {
       unsigned Num = Ty->getVectorNumElements();
       unsigned Cost = static_cast<T *>(this)
@@ -359,6 +373,11 @@ public:
         TLI->isZExtFree(SrcLT.second, DstLT.second))
       return 0;
 
+    if (Opcode == Instruction::AddrSpaceCast &&
+        TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
+                                 Dst->getPointerAddressSpace()))
+      return 0;
+
     // If the cast is marked as legal (or promote) then assume low cost.
     if (SrcLT.first == DstLT.first &&
         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
@@ -402,9 +421,25 @@ public:
       return SrcLT.first * 1;
     }
 
-    // If we are converting vectors and the operation is illegal, or
-    // if the vectors are legalized to different types, estimate the
-    // scalarization costs.
+    // If we are legalizing by splitting, query the concrete TTI for the cost
+    // of casting the original vector twice. We also need to factor in the
+    // cost of the split itself. Count that as 1, to be consistent with
+    // TLI->getTypeLegalizationCost().
+    if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
+         TargetLowering::TypeSplitVector) ||
+        (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
+         TargetLowering::TypeSplitVector)) {
+      Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
+                                       Dst->getVectorNumElements() / 2);
+      Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
+                                       Src->getVectorNumElements() / 2);
+      T *TTI = static_cast<T *>(this);
+      return TTI->getVectorSplitCost() +
+             (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc));
+    }
+
+    // In other cases where the source or destination are illegal, assume
+    // the operation will get scalarized.
     unsigned Num = Dst->getVectorNumElements();
     unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
         Opcode, Dst->getScalarType(), Src->getScalarType());
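
The new TypeSplitVector branch prices a cast on an illegal vector type recursively: one split, counted as 1 to match the getVectorSplitCost() hook introduced at the end of this patch, plus two casts of the half-width vectors. Below is a minimal standalone sketch of that recurrence; it is plain C++ with no LLVM types, and MaxLegalElems and the unit cast cost are assumptions of the sketch, not LLVM API:

#include <iostream>

// Illustrative assumptions, not LLVM API: vectors with at most
// MaxLegalElems elements are "legal", a legal cast costs 1, and a
// split costs 1 (mirroring getVectorSplitCost()).
constexpr unsigned MaxLegalElems = 4;
constexpr unsigned SplitCost = 1;

unsigned castCost(unsigned NumElems) {
  if (NumElems <= MaxLegalElems)
    return 1; // a legal cast is assumed to cost 1
  // Splitting halves both the source and destination vectors, so the
  // cost is one split plus two casts of the half-width vectors.
  return SplitCost + 2 * castCost(NumElems / 2);
}

int main() {
  // A 16-element cast with 4-wide legal vectors: 16 -> 8 -> 4 gives
  // 1 + 2 * (1 + 2 * 1) = 7.
  std::cout << castCost(16) << "\n"; // prints 7
}
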
@@ -428,6 +463,14 @@ public:
     llvm_unreachable("Unhandled cast");
   }
 
+  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+                                    VectorType *VecTy, unsigned Index) {
+    return static_cast<T *>(this)->getVectorInstrCost(
+               Instruction::ExtractElement, VecTy, Index) +
+           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
+                                                    VecTy->getElementType());
+  }
+
   unsigned getCFInstrCost(unsigned Opcode) {
     // Branches are assumed to be predicted.
     return 0;
@@ -454,6 +497,8 @@ public:
     }
 
     // Otherwise, assume that the cast is scalarized.
+    // TODO: If one of the types get legalized by splitting, handle this
+    // similarly to what getCastInstrCost() does.
     if (ValTy->isVectorTy()) {
       unsigned Num = ValTy->getVectorNumElements();
       if (CondTy)
@@ -462,8 +507,7 @@ public:
           Opcode, ValTy->getScalarType(), CondTy);
 
       // Return the cost of multiple scalar invocation plus the cost of
-      // inserting
-      // and extracting the values.
+      // inserting and extracting the values.
       return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
     }
 
@@ -527,6 +571,51 @@ public:
     unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
         Opcode, VecTy, Alignment, AddressSpace);
 
+    // Legalize the vector type, and get the legalized and unlegalized type
+    // sizes.
+    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+    unsigned VecTySize =
+        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
+    unsigned VecTyLTSize = VecTyLT.getStoreSize();
+
+    // Return the ceiling of dividing A by B.
+    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+    // Scale the cost of the memory operation by the fraction of legalized
+    // instructions that will actually be used. We shouldn't account for the
+    // cost of dead instructions since they will be removed.
+    //
+    // E.g., An interleaved load of factor 8:
+    //       %vec = load <16 x i64>, <16 x i64>* %ptr
+    //       %v0 = shufflevector %vec, undef, <0, 8>
+    //
+    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
+    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
+    // type). The other loads are unused.
+    //
+    // We only scale the cost of loads since interleaved store groups aren't
+    // allowed to have gaps.
+    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
+
+      // The number of loads of a legal type it will take to represent a load
+      // of the unlegalized vector type.
+      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
+
+      // The number of elements of the unlegalized type that correspond to a
+      // single legal instruction.
+      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
+
+      // Determine which legal instructions will be used.
+      BitVector UsedInsts(NumLegalInsts, false);
+      for (unsigned Index : Indices)
+        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
+          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
+
+      // Scale the cost of the load by the fraction of legal instructions that
+      // will be used.
+      Cost *= UsedInsts.count() / NumLegalInsts;
+    }
+
     // Then plus the cost of interleave operation.
     if (Opcode == Instruction::Load) {
       // The interleave cost is similar to extract sub vectors' elements
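
To make the scaling above concrete, here is a standalone walk-through of the factor-8 example from the comment: a <16 x i64> load legalized into eight v2i64 loads, with only group member 0 used. It is plain C++ with a std::set standing in for llvm::BitVector, and the byte sizes are assumptions of the sketch:

#include <iostream>
#include <set>

int main() {
  // Interleave group: factor 8 over a <16 x i64> load, one member used.
  unsigned NumElts = 16, Factor = 8;
  unsigned NumSubElts = NumElts / Factor;           // 2 elements per member
  unsigned VecTySize = 16 * 8, VecTyLTSize = 2 * 8; // bytes: <16 x i64> vs v2i64

  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
  unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);       // 8 loads
  unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts); // 2 elements

  std::set<unsigned> UsedInsts; // stands in for llvm::BitVector
  for (unsigned Index : {0u})   // indices of the used group members
    for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
      UsedInsts.insert((Index + Elt * Factor) / NumEltsPerLegalInst);

  // Elements 0 and 8 live in legal loads 0 and 4; the other six legal
  // loads are dead and should not be charged.
  std::cout << UsedInsts.size() << " of " << NumLegalInsts
            << " legal loads used\n"; // prints "2 of 8 legal loads used"
}
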
@@ -582,13 +671,14 @@ public:
 
   /// Get intrinsic cost based on arguments
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Value *> Args) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
     switch (IID) {
     default: {
       SmallVector<Type *, 4> Types;
       for (Value *Op : Args)
         Types.push_back(Op->getType());
-      return getIntrinsicInstrCost(IID, RetTy, Types);
+      return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
+                                                           FMF);
     }
     case Intrinsic::masked_scatter: {
       Value *Mask = Args[3];
@@ -614,8 +704,9 @@ public:
 
   /// Get intrinsic cost based on argument types
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Type *> Tys) {
-    unsigned ISD = 0;
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+    SmallVector<unsigned, 2> ISDs;
+    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
     default: {
       // Assume that we need to scalarize this intrinsic.
@@ -641,74 +732,78 @@ public:
         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
       unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
-          IID, ScalarRetTy, ScalarTys);
+          IID, ScalarRetTy, ScalarTys, FMF);
 
       return ScalarCalls * ScalarCost + ScalarizationCost;
     }
     // Look for intrinsics that can be lowered directly or turned into a scalar
     // intrinsic call.
     case Intrinsic::sqrt:
-      ISD = ISD::FSQRT;
+      ISDs.push_back(ISD::FSQRT);
       break;
     case Intrinsic::sin:
-      ISD = ISD::FSIN;
+      ISDs.push_back(ISD::FSIN);
       break;
     case Intrinsic::cos:
-      ISD = ISD::FCOS;
+      ISDs.push_back(ISD::FCOS);
       break;
     case Intrinsic::exp:
-      ISD = ISD::FEXP;
+      ISDs.push_back(ISD::FEXP);
       break;
     case Intrinsic::exp2:
-      ISD = ISD::FEXP2;
+      ISDs.push_back(ISD::FEXP2);
      break;
     case Intrinsic::log:
-      ISD = ISD::FLOG;
+      ISDs.push_back(ISD::FLOG);
       break;
     case Intrinsic::log10:
-      ISD = ISD::FLOG10;
+      ISDs.push_back(ISD::FLOG10);
       break;
     case Intrinsic::log2:
-      ISD = ISD::FLOG2;
+      ISDs.push_back(ISD::FLOG2);
       break;
     case Intrinsic::fabs:
-      ISD = ISD::FABS;
+      ISDs.push_back(ISD::FABS);
       break;
     case Intrinsic::minnum:
-      ISD = ISD::FMINNUM;
+      ISDs.push_back(ISD::FMINNUM);
+      if (FMF.noNaNs())
+        ISDs.push_back(ISD::FMINNAN);
       break;
     case Intrinsic::maxnum:
-      ISD = ISD::FMAXNUM;
+      ISDs.push_back(ISD::FMAXNUM);
+      if (FMF.noNaNs())
+        ISDs.push_back(ISD::FMAXNAN);
       break;
     case Intrinsic::copysign:
-      ISD = ISD::FCOPYSIGN;
+      ISDs.push_back(ISD::FCOPYSIGN);
       break;
     case Intrinsic::floor:
-      ISD = ISD::FFLOOR;
+      ISDs.push_back(ISD::FFLOOR);
       break;
     case Intrinsic::ceil:
-      ISD = ISD::FCEIL;
+      ISDs.push_back(ISD::FCEIL);
       break;
     case Intrinsic::trunc:
-      ISD = ISD::FTRUNC;
+      ISDs.push_back(ISD::FTRUNC);
       break;
     case Intrinsic::nearbyint:
-      ISD = ISD::FNEARBYINT;
+      ISDs.push_back(ISD::FNEARBYINT);
      break;
     case Intrinsic::rint:
-      ISD = ISD::FRINT;
+      ISDs.push_back(ISD::FRINT);
       break;
     case Intrinsic::round:
-      ISD = ISD::FROUND;
+      ISDs.push_back(ISD::FROUND);
       break;
     case Intrinsic::pow:
-      ISD = ISD::FPOW;
+      ISDs.push_back(ISD::FPOW);
       break;
     case Intrinsic::fma:
-      ISD = ISD::FMA;
+      ISDs.push_back(ISD::FMA);
       break;
     case Intrinsic::fmuladd:
-      ISD = ISD::FMA;
+      ISDs.push_back(ISD::FMA);
       break;
     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
     case Intrinsic::lifetime_start:
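
The switch now collects candidate ISD opcodes into a vector instead of a single value because fast-math flags can make extra lowerings eligible: under nnan, minnum and maxnum may also be costed as the NaN-propagating FMINNAN and FMAXNAN nodes. A minimal sketch of that candidate gathering follows; the enum and function names are stand-ins for illustration, not LLVM API:

#include <iostream>
#include <vector>

enum FakeISD { FMINNUM, FMINNAN }; // stand-ins for ISD::FMINNUM/FMINNAN

// Mirrors the minnum case above: FMINNUM is always a candidate, and
// FMINNAN becomes one when the call site carries the no-NaNs flag.
std::vector<FakeISD> minnumCandidates(bool NoNaNs) {
  std::vector<FakeISD> ISDs{FMINNUM};
  if (NoNaNs) // mirrors "if (FMF.noNaNs())" in the patch
    ISDs.push_back(FMINNAN);
  return ISDs;
}

int main() {
  std::cout << minnumCandidates(false).size() << "\n"; // 1 candidate
  std::cout << minnumCandidates(true).size() << "\n";  // 2 candidates
}
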
@@ -720,27 +815,49 @@ public:
     case Intrinsic::masked_load:
       return static_cast<T *>(this)
           ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+    case Intrinsic::ctpop:
+      ISDs.push_back(ISD::CTPOP);
+      // In case of legalization use TCC_Expensive. This is cheaper than a
+      // library call but still not a cheap instruction.
+      SingleCallCost = TargetTransformInfo::TCC_Expensive;
+      break;
+    // FIXME: ctlz, cttz, ...
     }
 
     const TargetLoweringBase *TLI = getTLI();
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
 
-    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
-      // The operation is legal. Assume it costs 1.
-      // If the type is split to multiple registers, assume that there is some
-      // overhead to this.
-      // TODO: Once we have extract/insert subvector cost we need to use them.
-      if (LT.first > 1)
-        return LT.first * 2;
-      return LT.first * 1;
-    }
+    SmallVector<unsigned, 2> LegalCost;
+    SmallVector<unsigned, 2> CustomCost;
+    for (unsigned ISD : ISDs) {
+      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+        if (IID == Intrinsic::fabs && TLI->isFAbsFree(LT.second)) {
+          return 0;
+        }
 
-    if (!TLI->isOperationExpand(ISD, LT.second)) {
-      // If the operation is custom lowered then assume
-      // thare the code is twice as expensive.
-      return LT.first * 2;
+        // The operation is legal. Assume it costs 1.
+        // If the type is split to multiple registers, assume that there is some
+        // overhead to this.
+        // TODO: Once we have extract/insert subvector cost we need to use them.
+        if (LT.first > 1)
+          LegalCost.push_back(LT.first * 2);
+        else
+          LegalCost.push_back(LT.first * 1);
+      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
+        // If the operation is custom lowered then assume
+        // that the code is twice as expensive.
+        CustomCost.push_back(LT.first * 2);
+      }
     }
 
+    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
+    if (MinLegalCostI != LegalCost.end())
+      return *MinLegalCostI;
+
+    auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
+    if (MinCustomCostI != CustomCost.end())
+      return *MinCustomCostI;
+
     // If we can't lower fmuladd into an FMA estimate the cost as a floating
     // point mul followed by an add.
     if (IID == Intrinsic::fmuladd)
@@ -763,7 +880,7 @@ public:
           ScalarTys.push_back(Ty);
       }
       unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
-          IID, RetTy->getScalarType(), ScalarTys);
+          IID, RetTy->getScalarType(), ScalarTys, FMF);
 
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         if (Tys[i]->isVectorTy()) {
           ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
@@ -775,7 +892,7 @@ public:
     }
 
     // This is going to be turned into a library call, make it expensive.
-    return 10;
+    return SingleCallCost;
   }
 
   /// \brief Compute a cost of the given call instruction.
@@ -815,6 +932,8 @@ public:
     return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
   }
 
+  unsigned getVectorSplitCost() { return 1; }
+
   /// @}
 };
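
With several candidate ISDs in play, the selection logic above prefers the cheapest legal lowering, then the cheapest custom lowering, and only then falls back to SingleCallCost. The sketch below mirrors that policy in plain C++; the Lowering enum and intrinsicCost are illustrative names, and LTFirst models LT.first, the number of registers the legalized type splits into:

#include <algorithm>
#include <iostream>
#include <vector>

enum class Lowering { Legal, Custom, Expand };

unsigned intrinsicCost(const std::vector<Lowering> &Candidates,
                       unsigned LTFirst, unsigned SingleCallCost) {
  std::vector<unsigned> LegalCost, CustomCost;
  for (Lowering L : Candidates) {
    if (L == Lowering::Legal) // legal: 1 per register, doubled when split
      LegalCost.push_back(LTFirst > 1 ? LTFirst * 2 : LTFirst);
    else if (L == Lowering::Custom) // custom: assume twice as expensive
      CustomCost.push_back(LTFirst * 2);
  }
  if (!LegalCost.empty())
    return *std::min_element(LegalCost.begin(), LegalCost.end());
  if (!CustomCost.empty())
    return *std::min_element(CustomCost.begin(), CustomCost.end());
  return SingleCallCost; // library call (or TCC_Expensive for ctpop)
}

int main() {
  // One Expand and one Legal candidate on a type that legalizes in one
  // step: the legal lowering wins with cost 1, not a library call.
  std::cout << intrinsicCost({Lowering::Expand, Lowering::Legal}, 1, 10)
            << "\n"; // prints 1
}
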