author     Dimitry Andric <dim@FreeBSD.org>   2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
 -rw-r--r--   llvm/lib/Target/X86/X86TargetTransformInfo.cpp   146
1 file changed, 112 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 70fd857fcf01..b754836ea517 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -169,12 +169,13 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   return 2;
 }
 
-int X86TTIImpl::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty,
-    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
-    TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) {
+int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                       TTI::OperandValueKind Op1Info,
+                                       TTI::OperandValueKind Op2Info,
+                                       TTI::OperandValueProperties Opd1PropInfo,
+                                       TTI::OperandValueProperties Opd2PropInfo,
+                                       ArrayRef<const Value *> Args,
+                                       const Instruction *CxtI) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
@@ -188,7 +189,7 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::FDIV, MVT::v2f64, 65 }, // divpd
   };
 
-  if (ST->isGLM())
+  if (ST->useGLMDivSqrtCosts())
     if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                             LT.second))
       return LT.first * Entry->Cost;
@@ -280,7 +281,7 @@ int X86TTIImpl::getArithmeticInstrCost(
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
 
-    if (ISD == ISD::UREM)
+    else // UREM
       return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
@@ -1389,6 +1390,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
 
     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
@@ -1397,6 +1399,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  2 },
     { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  2 },
     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
@@ -1550,6 +1553,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i64,  1 }, // PSHUFB
 
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
   };
 
@@ -1576,9 +1580,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  6 },
     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  15 },
 
+    { ISD::FP_TO_SINT,  MVT::v4i16,  MVT::v4f32,  2 },
+    { ISD::FP_TO_SINT,  MVT::v2i16,  MVT::v2f64,  2 },
+    { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
+
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    6 },
     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
 
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
     { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
 
@@ -2199,7 +2208,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   MVT MTy = LT.second;
 
   // Attempt to lookup cost.
-  if (ST->isGLM())
+  if (ST->useGLMDivSqrtCosts())
     if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
@@ -2374,6 +2383,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  static const CostTblEntry SLMCostTbl[] = {
+     { ISD::EXTRACT_VECTOR_ELT,  MVT::i8,   4 },
+     { ISD::EXTRACT_VECTOR_ELT,  MVT::i16,  4 },
+     { ISD::EXTRACT_VECTOR_ELT,  MVT::i32,  4 },
+     { ISD::EXTRACT_VECTOR_ELT,  MVT::i64,  7 }
+   };
+
   assert(Val->isVectorTy() && "This must be a vector type");
 
   Type *ScalarType = Val->getScalarType();
@@ -2390,9 +2406,22 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
     unsigned Width = LT.second.getVectorNumElements();
     Index = Index % Width;
 
-    // Floating point scalars are already located in index #0.
-    if (ScalarType->isFloatingPointTy() && Index == 0)
-      return 0;
+    if (Index == 0) {
+      // Floating point scalars are already located in index #0.
+      if (ScalarType->isFloatingPointTy())
+        return 0;
+
+      // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
+      if (ScalarType->isIntegerTy())
+        return 1;
+    }
+
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Unexpected vector opcode");
+    MVT MScalarTy = LT.second.getScalarType();
+    if (ST->isSLM())
+      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
+        return LT.first * Entry->Cost;
   }
 
   // Add to the base cost if we know that the extracted element of a vector is
@@ -2404,8 +2433,9 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
-int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace, const Instruction *I) {
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                MaybeAlign Alignment, unsigned AddressSpace,
+                                const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
     unsigned NumElem = VTy->getVectorNumElements();
@@ -2456,7 +2486,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
   if (!SrcVTy)
     // To calculate scalar take the regular cost, without mask
-    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+    return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace);
 
   unsigned NumElem = SrcVTy->getVectorNumElements();
   VectorType *MaskTy =
@@ -2474,7 +2504,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
     int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
     int MemopCost =
         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
-                                         Alignment, AddressSpace);
+                                         MaybeAlign(Alignment), AddressSpace);
     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
   }
 
@@ -2533,6 +2563,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
 
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
+  static const CostTblEntry SLMCostTblPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   3 },
+    { ISD::ADD,   MVT::v2i64,   5 },
+  };
+
   static const CostTblEntry SSE2CostTblPairWise[] = {
     { ISD::FADD,  MVT::v2f64,   2 },
     { ISD::FADD,  MVT::v4f32,   4 },
@@ -2558,6 +2593,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
     { ISD::ADD,   MVT::v32i8,   4 },
   };
 
+  static const CostTblEntry SLMCostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   3 },
+    { ISD::ADD,   MVT::v2i64,   5 },
+  };
+
   static const CostTblEntry SSE2CostTblNoPairWise[] = {
     { ISD::FADD,  MVT::v2f64,   2 },
     { ISD::FADD,  MVT::v4f32,   4 },
@@ -2594,6 +2634,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
   if (VT.isSimple()) {
     MVT MTy = VT.getSimpleVT();
     if (IsPairwise) {
+      if (ST->isSLM())
+        if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+
       if (ST->hasAVX())
         if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
           return Entry->Cost;
@@ -2602,6 +2646,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
         if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
           return Entry->Cost;
     } else {
+      if (ST->isSLM())
+        if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+
       if (ST->hasAVX())
         if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
           return Entry->Cost;
@@ -2617,6 +2665,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
   MVT MTy = LT.second;
 
   if (IsPairwise) {
+    if (ST->isSLM())
+      if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
@@ -2625,6 +2677,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
       if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
   } else {
+    if (ST->isSLM())
+      if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
@@ -2634,6 +2690,24 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
         return LT.first * Entry->Cost;
   }
 
+  // FIXME: These assume a naive kshift+binop lowering, which is probably
+  // conservative in most cases.
+  // FIXME: This doesn't cost large types like v128i1 correctly.
+  static const CostTblEntry AVX512BoolReduction[] = {
+    { ISD::AND,  MVT::v2i1,   3 },
+    { ISD::AND,  MVT::v4i1,   5 },
+    { ISD::AND,  MVT::v8i1,   7 },
+    { ISD::AND,  MVT::v16i1,  9 },
+    { ISD::AND,  MVT::v32i1, 11 },
+    { ISD::AND,  MVT::v64i1, 13 },
+    { ISD::OR,   MVT::v2i1,   3 },
+    { ISD::OR,   MVT::v4i1,   5 },
+    { ISD::OR,   MVT::v8i1,   7 },
+    { ISD::OR,   MVT::v16i1,  9 },
+    { ISD::OR,   MVT::v32i1, 11 },
+    { ISD::OR,   MVT::v64i1, 13 },
+  };
+
   static const CostTblEntry AVX2BoolReduction[] = {
     { ISD::AND, MVT::v16i16,  2 }, // vpmovmskb + cmp
     { ISD::AND, MVT::v32i8,   2 }, // vpmovmskb + cmp
@@ -2664,7 +2738,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
   };
 
   // Handle bool allof/anyof patterns.
-  if (ValTy->getVectorElementType()->isIntegerTy(1)) {
+  if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) {
+    if (ST->hasAVX512())
+      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
+        return LT.first * Entry->Cost;
     if (ST->hasAVX2())
       if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
         return LT.first * Entry->Cost;
@@ -2956,7 +3033,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   return std::max(1, Cost);
 }
 
-int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) {
   assert(Ty->isIntegerTy());
 
@@ -3053,8 +3130,8 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
-int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                              Type *Ty) {
+int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+                                    const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3164,7 +3241,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                                          ? ST->getGatherOverhead()
                                          : ST->getScatterOverhead();
   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
-                                           Alignment, AddressSpace);
+                                           MaybeAlign(Alignment), AddressSpace);
 }
 
 /// Return the cost of full scalarization of gather / scatter operation.
@@ -3194,7 +3271,7 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
 
   // The cost of the scalar loads/stores.
   int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
-                                          Alignment, AddressSpace);
+                                          MaybeAlign(Alignment), AddressSpace);
 
   int InsertExtractCost = 0;
   if (Opcode == Instruction::Load)
@@ -3224,8 +3301,10 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
   unsigned AddressSpace = PtrTy->getAddressSpace();
 
   bool Scalarize = false;
-  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
-      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+  if ((Opcode == Instruction::Load &&
+       !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) ||
+      (Opcode == Instruction::Store &&
+       !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment))))
     Scalarize = true;
   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
   // Vector-4 of gather/scatter instruction does not exist on KNL.
@@ -3348,7 +3427,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
   return isLegalMaskedExpandLoad(DataTy);
 }
 
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
   // Some CPUs have better gather performance than others.
   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
   // enable gather with a -march.
@@ -3386,11 +3465,11 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   return IntWidth == 32 || IntWidth == 64;
 }
 
-bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
   // AVX2 doesn't support scatter
   if (!ST->hasAVX512())
     return false;
-  return isLegalMaskedGather(DataType);
+  return isLegalMaskedGather(DataType, Alignment);
 }
 
 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
@@ -3443,10 +3522,9 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     // version is not as fast for three way compare (see #33329).
     const unsigned PreferredWidth = ST->getPreferVectorWidth();
     if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
-    if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
+    if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
-    // All GPR and vector loads can be unaligned. SIMD compare requires integer
-    // vectors (SSE2/AVX2).
+    // All GPR and vector loads can be unaligned.
     Options.AllowOverlappingLoads = true;
   }
   if (ST->is64Bit()) {
@@ -3520,8 +3598,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
   // Get the cost of one memory operation.
   Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                         LegalVT.getVectorNumElements());
-  unsigned MemOpCost =
-      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+  unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+                                       MaybeAlign(Alignment), AddressSpace);
 
   VectorType *VT = VectorType::get(ScalarTy, VF);
   EVT ETy = TLI->getValueType(DL, VT);
@@ -3620,8 +3698,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
   // Get the cost of one memory operation.
   Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                         LegalVT.getVectorNumElements());
-  unsigned MemOpCost =
-      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+  unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+                                       MaybeAlign(Alignment), AddressSpace);
 
   unsigned VF = VecTy->getVectorNumElements() / Factor;
   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
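
Several hunks above swap ST->isGLM() for ST->useGLMDivSqrtCosts() and add new per-subtarget cost tables (SLMCostTbl, SLMCostTblPairWise, AVX512BoolReduction). They all lean on the same CostTblEntry/CostTableLookup idiom. A minimal sketch of that idiom, with a hypothetical table name and made-up cost numbers (the real tables and numbers are in the diff itself):

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/MachineValueType.h"
#include <utility>
using namespace llvm;

// Hypothetical table mapping an ISD opcode plus a legalized MVT to a cost.
static const CostTblEntry ExampleDivCostTbl[] = {
  { ISD::FDIV, MVT::f32,   18 }, // illustrative throughput numbers only
  { ISD::FDIV, MVT::v4f32, 35 },
};

// LT mirrors the result of TLI->getTypeLegalizationCost(DL, Ty): LT.first is
// the number of legalized operations the IR type splits into, and LT.second
// is the legalized machine value type used as the table key.
static int lookupExampleCost(int ISD, std::pair<int, MVT> LT) {
  if (const auto *Entry = CostTableLookup(ExampleDivCostTbl, ISD, LT.second))
    return LT.first * Entry->Cost; // scale per-op cost by the split factor
  return -1;                       // no table hit: fall back to generic costing
}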
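
The new AVX512BoolReduction entries are self-consistent with the FIXME's "naive kshift+binop lowering": each doubling of the vXi1 element count adds 2 to the cost. A hedged sketch of that inferred pattern; the closed form below is a reading of the table, not something the patch states:

#include <cassert>

// Assumed model: each halving step of a vXi1 mask costs one kshift plus one
// kand/kor, and moving the final 1-bit result out of a mask register adds one.
static unsigned assumedBoolReductionCost(unsigned NumElts) {
  assert(NumElts >= 2 && (NumElts & (NumElts - 1)) == 0 &&
         "table only covers power-of-two element counts up to v64i1");
  unsigned Cost = 1;                    // final kmov/test of the reduced mask
  for (unsigned N = NumElts; N > 1; N /= 2)
    Cost += 2;                          // one kshift + one binop per halving
  return Cost;                          // v2i1 -> 3, v16i1 -> 9, v64i1 -> 13
}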
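
The remaining mechanical change is threading MaybeAlign through getMemoryOpCost, isLegalMaskedGather, and isLegalMaskedScatter, wrapping legacy unsigned alignments at each call site. A small sketch of what the wrapper buys; the helper name and fallback policy are illustrative, not part of the patch:

#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;

// A legacy alignment of 0 means "unknown". MaybeAlign models that as an empty
// Optional instead of an invalid Align(0), so callees can distinguish
// "unaligned access" from "no alignment information".
static uint64_t alignmentOrABIDefault(unsigned LegacyAlign,
                                      uint64_t TypeABIAlign) {
  MaybeAlign MA(LegacyAlign);             // MaybeAlign(0) is None
  return MA ? MA->value() : TypeABIAlign; // fall back to the type's ABI align
}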