Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 313
1 file changed, 207 insertions(+), 106 deletions(-)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 223eed3048db..bae2ef80c365 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -130,12 +130,13 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
}
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+ unsigned PreferVectorWidth = ST->getPreferVectorWidth();
if (Vector) {
- if (ST->hasAVX512())
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
return 512;
- if (ST->hasAVX())
+ if (ST->hasAVX() && PreferVectorWidth >= 256)
return 256;
- if (ST->hasSSE1())
+ if (ST->hasSSE1() && PreferVectorWidth >= 128)
return 128;
return 0;
}
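Taken together, the three new gates make the advertised vector register width respect the subtarget's preferred vector width instead of the widest supported ISA. A rough behaviour table (illustrative values; getPreferVectorWidth() is assumed to report "no limit" when no prefer-vector-width attribute is set):

    //  Subtarget                 PreferVectorWidth   getRegisterBitWidth(true)
    //  AVX-512, no preference    UINT32_MAX          512
    //  AVX-512, prefer 256       256                 256
    //  AVX2,    prefer 128       128                 128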
@@ -180,28 +181,40 @@ int X86TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry GLMCostTable[] = {
+ { ISD::FDIV, MVT::f32, 18 }, // divss
+ { ISD::FDIV, MVT::v4f32, 35 }, // divps
+ { ISD::FDIV, MVT::f64, 33 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 65 }, // divpd
+ };
+
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SLMCostTable[] = {
- { ISD::MUL, MVT::v4i32, 11 }, // pmulld
- { ISD::MUL, MVT::v8i16, 2 }, // pmullw
- { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
- { ISD::FMUL, MVT::f64, 2 }, // mulsd
- { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
- { ISD::FMUL, MVT::v4f32, 2 }, // mulps
- { ISD::FDIV, MVT::f32, 17 }, // divss
- { ISD::FDIV, MVT::v4f32, 39 }, // divps
- { ISD::FDIV, MVT::f64, 32 }, // divsd
- { ISD::FDIV, MVT::v2f64, 69 }, // divpd
- { ISD::FADD, MVT::v2f64, 2 }, // addpd
- { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long
// multiplies(3), shifts(3) and adds(2)
// slm muldq version throughput is 2 and addq throughput 4
- // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+ // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
// 3X4 (addq throughput) = 17
- { ISD::MUL, MVT::v2i64, 17 },
+ { ISD::MUL, MVT::v2i64, 17 },
// slm addq/subq throughput is 4
- { ISD::ADD, MVT::v2i64, 4 },
- { ISD::SUB, MVT::v2i64, 4 },
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
};
if (ST->isSLM()) {
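Both per-CPU tables above are keyed on the legalized type, and every lookup in this function scales the table entry by LT.first, the number of legal-width pieces the IR type splits into. A worked example (hypothetical input, not taken from the patch): Goldmont tops out at 128-bit vectors, so an fdiv on v8f32 legalizes to two v4f32 operations and costs 2 * 35 = 70.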
@@ -224,30 +237,53 @@ int X86TTIImpl::getArithmeticInstrCost(
if (!signedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
+
if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
LT.second)) {
return LT.first * Entry->Cost;
}
}
- if (ISD == ISD::SDIV &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
+ ISD == ISD::UREM) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- // On X86, vector signed division by constants power-of-two are
- // normally expanded to the sequence SRA + SRL + ADD + SRA.
- // The OperandValue properties many not be same as that of previous
- // operation;conservatively assume OP_None.
- int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
- Op2Info, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+ // On X86, vector signed divisions by power-of-two constants are
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as those of the
+ // previous operation; conservatively assume OP_None.
+ int Cost =
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::SREM) {
+ // For SREM: (X % C) is equivalent to (X - (X/C)*C)
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ }
+
+ return Cost;
+ }
- return Cost;
+ // Vector unsigned division/remainder will be simplified to shifts/masks.
+ if (ISD == ISD::UDIV)
+ return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::UREM)
+ return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
}
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
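The power-of-two division/remainder costs computed above mirror the scalar lowerings. A minimal C++ sketch of the four expansions for a 32-bit lane and divisor C = 1 << k (illustrative only; helper names are hypothetical, an arithmetic right shift is assumed for signed >>, and 1 <= k <= 30):

    // UDIV by 2^k: one logical shift right, hence the single LShr cost.
    unsigned udiv_pow2(unsigned x, unsigned k) { return x >> k; }

    // UREM by 2^k: one mask, hence the single And cost.
    unsigned urem_pow2(unsigned x, unsigned k) { return x & ((1u << k) - 1); }

    // SDIV by 2^k: the SRA + SRL + ADD + SRA sequence costed above.
    int sdiv_pow2(int x, unsigned k) {
      int Sign = x >> 31;                         // SRA: all-ones if x < 0
      unsigned Bias = (unsigned)Sign >> (32 - k); // SRL: 2^k - 1 if x < 0
      return (x + (int)Bias) >> k;                // ADD, then the final SRA
    }

    // SREM by 2^k: X - (X / C) * C, i.e. a Mul and a Sub on top of SDIV.
    int srem_pow2(int x, unsigned k) { return x - sdiv_pow2(x, k) * (1 << k); }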
@@ -256,7 +292,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -272,7 +310,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -290,9 +330,13 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -312,13 +356,21 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -326,8 +378,12 @@ int X86TTIImpl::getArithmeticInstrCost(
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
return LT.first * 32;
+ if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 38;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
+ if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 20;
// XOP has faster vXi8 shifts.
if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
@@ -405,12 +461,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v64i8, 64*20 },
- { ISD::SDIV, MVT::v32i16, 32*20 },
- { ISD::UDIV, MVT::v64i8, 64*20 },
- { ISD::UDIV, MVT::v32i16, 32*20 }
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -432,14 +482,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v16i32, 16*20 },
- { ISD::SDIV, MVT::v8i64, 8*20 },
- { ISD::UDIV, MVT::v16i32, 16*20 },
- { ISD::UDIV, MVT::v8i64, 8*20 }
+ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+
+ { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
@@ -468,7 +522,9 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return LT.first;
+ return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
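Returning the multiply cost here instead of a bare LT.first presumably lets target-specific multiply costs (such as the pmullw/pmulld rows in the tables above) flow into constant shift-left costs as well, since both lower to the same vector multiply; that reading is an inference from the surrounding tables, not stated in the patch.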
@@ -571,9 +627,16 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -617,16 +680,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
};
if (ST->hasAVX())
@@ -634,6 +687,21 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
+
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
@@ -666,7 +734,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
@@ -703,21 +771,6 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
-
- // It is not a good idea to vectorize division. We have to scalarize it and
- // in the process we will often end up having to spilling regular
- // registers. The overhead of division is going to dominate most kernels
- // anyways so try hard to prevent vectorization of division - it is
- // generally a bad idea. Assume somewhat arbitrarily that we have to be able
- // to hide "20 cycles" for each lane.
- { ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
- { ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2())
@@ -733,6 +786,20 @@ int X86TTIImpl::getArithmeticInstrCost(
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spill regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyway, so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
+ ISD == ISD::UDIV || ISD == ISD::UREM)) {
+ int ScalarCost = getArithmeticInstrCost(
+ Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
+ }
+
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
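This relocated heuristic replaces the per-type 20-cycles-per-lane SDIV/UDIV entries deleted from the AVX-512BW, AVX-512, AVX and SSE2 tables above, and extends the penalty to SREM/UREM and to any vector width. A worked check (assuming the base implementation returns the typical cost of 1 for a legal scalar divide): an sdiv of v4i32 by a non-constant divisor under SSE2 costs 20 * 1 (LT.first) * 4 (lanes) * 1 = 80, matching the removed { ISD::SDIV, MVT::v4i32, 4*20 } entry.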
@@ -754,7 +821,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// type remains the same.
if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
MVT LegalVT = LT.second;
- if (LegalVT.getVectorElementType().getSizeInBits() ==
+ if (LegalVT.isVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
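The added LegalVT.isVector() guard matters when type legalization reduces a vector IR type all the way to a scalar MVT: MVT::getVectorElementType() asserts on non-vector types, so it must not be reached in that case.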
@@ -886,8 +954,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
- { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
+ { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb
{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
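Throughout the shuffle tables, SK_Alternate is renamed to SK_Select, which covers any per-element select between two source vectors rather than only the alternating even/odd pattern. That generalization is also why the v16i16 entry above moves from vpblendw to vpblendvb: the immediate-controlled vpblendw cannot express an arbitrary word-level select across both 128-bit lanes, while the variable-mask vpblendvb can.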
@@ -951,15 +1019,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
// + vinsertf128
- { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
@@ -967,9 +1035,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd
{ TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
{ TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
@@ -982,12 +1050,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE41ShuffleTbl[] = {
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
- { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
};
if (ST->hasSSE41())
@@ -1001,8 +1069,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
@@ -1029,11 +1097,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
@@ -1057,7 +1125,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
+ { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
@@ -1487,6 +1555,15 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SETCC, MVT::v16f32, 1 },
};
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
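With BWI available, a 512-bit byte or word compare lowers to a single vpcmpb/vpcmpw into a mask register, hence the unit costs; placing the hasBWI() lookup before the generic hasAVX512() one makes the v32i16/v64i8 entries take precedence.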
@@ -1631,6 +1708,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
+ static const CostTblEntry GLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+ };
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
+ { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+ { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
+ { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+ };
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
@@ -1721,6 +1810,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->isGLM())
+ if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1886,8 +1983,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+ Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2145,7 +2242,7 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
}
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int X86TTIImpl::getIntImmCost(int64_t Val) {
@@ -2480,6 +2577,10 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool X86TTIImpl::canMacroFuseCmp() {
+ return ST->hasMacroFusion();
+}
+
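The new hook simply forwards the subtarget's macro-fusion flag so that target-independent passes can tell when a compare and its consuming branch fuse into one macro-op and are therefore best kept adjacent. A hypothetical caller, purely for illustration (not from this patch):

    // Sketch only: a pass deciding whether to hoist code between a compare
    // and its branch might first consult the target:
    if (TTI.canMacroFuseCmp()) {
      // cmp+jcc fuse on this CPU; keep the pair adjacent.
    }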
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
@@ -2522,7 +2623,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
// enable gather with a -march.
return (DataWidth == 32 || DataWidth == 64) &&
- (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+ (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {