Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 630 ++++++++++----------
1 file changed, 303 insertions(+), 327 deletions(-)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index d7792e296a58e..de4839432b9ad 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -80,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
if (Vector) {
- if (ST->hasAVX512()) return 512;
- if (ST->hasAVX()) return 256;
- if (ST->hasSSE1()) return 128;
+ if (ST->hasAVX512())
+ return 512;
+ if (ST->hasAVX())
+ return 256;
+ if (ST->hasSSE1())
+ return 128;
return 0;
}
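For context, cost-model clients such as the loop vectorizer size their vectorization factor from this hook. A minimal sketch of that usage pattern (the selectMaxVF helper is hypothetical, not part of this patch; it assumes the llvm::TargetTransformInfo API):

    // Hypothetical client: derive a maximum VF from the widest vector register.
    unsigned selectMaxVF(const TargetTransformInfo &TTI, Type *ScalarTy) {
      unsigned WidestBits = TTI.getRegisterBitWidth(/*Vector=*/true); // 512, 256, 128 or 0
      unsigned ScalarBits = ScalarTy->getPrimitiveSizeInBits();
      return (WidestBits && ScalarBits) ? WidestBits / ScalarBits : 1; // e.g. 512/32 -> VF 16
    }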
@@ -211,11 +214,9 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for AVX512DQ lowering tricks for custom cases.
- if (ST->hasDQI()) {
- if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
- LT.second))
+ if (ST->hasDQI())
+ if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVX512BWCostTable[] = {
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
@@ -225,37 +226,38 @@ int X86TTIImpl::getArithmeticInstrCost(
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v64i8, 64*20 },
{ ISD::SDIV, MVT::v32i16, 32*20 },
- { ISD::SDIV, MVT::v16i32, 16*20 },
- { ISD::SDIV, MVT::v8i64, 8*20 },
{ ISD::UDIV, MVT::v64i8, 64*20 },
- { ISD::UDIV, MVT::v32i16, 32*20 },
- { ISD::UDIV, MVT::v16i32, 16*20 },
- { ISD::UDIV, MVT::v8i64, 8*20 },
+ { ISD::UDIV, MVT::v32i16, 32*20 }
};
// Look for AVX512BW lowering tricks for custom cases.
- if (ST->hasBWI()) {
- if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD,
- LT.second))
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVX512CostTable[] = {
- { ISD::SHL, MVT::v16i32, 1 },
- { ISD::SRL, MVT::v16i32, 1 },
- { ISD::SRA, MVT::v16i32, 1 },
- { ISD::SHL, MVT::v8i64, 1 },
- { ISD::SRL, MVT::v8i64, 1 },
- { ISD::SRA, MVT::v8i64, 1 },
-
- { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
+ { ISD::SHL, MVT::v16i32, 1 },
+ { ISD::SRL, MVT::v16i32, 1 },
+ { ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v8i64, 1 },
+ { ISD::SRL, MVT::v8i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v16i32, 16*20 },
+ { ISD::SDIV, MVT::v8i64, 8*20 },
+ { ISD::UDIV, MVT::v16i32, 16*20 },
+ { ISD::UDIV, MVT::v8i64, 8*20 }
};
- if (ST->hasAVX512()) {
+ if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVX2CostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
@@ -315,10 +317,9 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for XOP lowering tricks.
- if (ST->hasXOP()) {
+ if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVX2CustomCostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
@@ -334,6 +335,8 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
@@ -344,11 +347,10 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX2()) {
+ if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVXCustomCostTable[] = {
{ ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
@@ -372,24 +374,10 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for AVX lowering tricks for custom cases.
- if (ST->hasAVX()) {
+ if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
- }
-
- static const CostTblEntry SSE42FloatCostTable[] = {
- { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
- };
-
- if (ST->hasSSE42()) {
- if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
- }
static const CostTblEntry
SSE2UniformCostTable[] = {
@@ -452,6 +440,17 @@ int X86TTIImpl::getArithmeticInstrCost(
ISD = ISD::MUL;
}
+ static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+ };
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SSE41CostTable[] = {
{ ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
@@ -471,44 +470,39 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
- if (ST->hasSSE41()) {
+ if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
{ ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
{ ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
{ ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
@@ -531,10 +525,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::UDIV, MVT::v2i64, 2*20 },
};
- if (ST->hasSSE2()) {
+ if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
static const CostTblEntry AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
@@ -553,307 +546,278 @@ int X86TTIImpl::getArithmeticInstrCost(
// A v4i64 multiply is custom lowered as two split v2i64 vectors that then
// are lowered as a series of long multiplies(3), shifts(3) and adds(2)
// Because we believe v4i64 to be a legal type, we must also include the
- // split factor of two in the cost table. Therefore, the cost here is 16
+ // extract+insert in the cost table. Therefore, the cost here is 18
// instead of 8.
- { ISD::MUL, MVT::v4i64, 16 },
+ { ISD::MUL, MVT::v4i64, 18 },
};
// Look for AVX1 lowering tricks.
- if (ST->hasAVX() && !ST->hasAVX2()) {
- MVT VT = LT.second;
-
- if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+ if (ST->hasAVX() && !ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
- // Custom lowering of vectors.
- static const CostTblEntry CustomLowered[] = {
- // A v2i64/v4i64 and multiply is custom lowered as a series of long
- // multiplies(3), shifts(3) and adds(2).
- { ISD::MUL, MVT::v2i64, 8 },
- { ISD::MUL, MVT::v4i64, 8 },
- { ISD::MUL, MVT::v8i64, 8 }
- };
- if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
- return LT.first * Entry->Cost;
-
- // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
- // 2x pmuludq, 2x shuffle.
- if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
- !ST->hasSSE41())
- return LT.first * 6;
-
- static const CostTblEntry SSE1FloatCostTable[] = {
+ static const CostTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD,
- LT.second))
+ if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
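A worked check of the updated v4i64 MUL entry (our arithmetic, following the comment in the hunk above): each v2i64 half costs 3 pmuludq + 3 shifts + 2 adds = 8 ops, the type splits into two such halves, and rejoining them takes an extract plus an insert, giving 2*8 + 2 = 18 rather than the old split-factor estimate of 2*8 = 16.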
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
- // 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- static const CostTblEntry AVX512VBMIShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
- { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb
- };
-
- if (ST->hasVBMI())
- if (const auto *Entry =
- CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ // For Broadcasts we are splatting the first element from the first input
+ // register, so we only need to reference that input; all of the output
+ // registers are the same.
+ if (Kind == TTI::SK_Broadcast)
+ LT.first = 1;
+
+ // We are going to permute multiple sources and the result will be in multiple
+ // destinations. Providing an accurate cost only for splits where the element
+ // type remains the same.
+ if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
+ MVT LegalVT = LT.second;
+ if (LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+ unsigned NumOfDests = LT.first;
+
+ Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+
+ unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles *
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ }
- static const CostTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
- // + 2*pshufb + vinserti64x4
- };
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
- if (ST->hasBWI())
- if (const auto *Entry =
- CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ // For 2-input shuffles, we must account for splitting the 2 inputs into many.
+ if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
+ // We assume that source and destination have the same vector type.
+ int NumOfDests = LT.first;
+ int NumOfShufflesPerDest = LT.first * 2 - 1;
+ LT.first = NumOfDests * NumOfShufflesPerDest;
+ }
- static const CostTblEntry AVX512ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
- { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
- { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
- { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
- };
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
+ { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
- static const CostTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
+ { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
+ };
- { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
- };
+ if (ST->hasVBMI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128
+ // + 2*pshufb + vinserti64x4
+
+ { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
+
+ { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
+ { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
+ };
- static const CostTblEntry AVX1ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
- { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
- { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
- // + vinsertf128
- { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
- // + vinsertf128
-
- { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
- { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
- { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
- };
+ if (ST->hasBWI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
+
+ { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
+
+ { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+
+ { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
+ { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
+ };
- static const CostTblEntry SSE41ShuffleTbl[] = {
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
- { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
- { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
- };
-
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- static const CostTblEntry SSSE3ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
- { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
+ static const CostTblEntry AVX2ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
+ { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
+
+ { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
+ { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
+ };
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
- };
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- if (ST->hasSSSE3())
- if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX1ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
+ { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
+
+ { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
+ };
- static const CostTblEntry SSE2ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
- { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
- { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
- { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
- // + 2*pshufd + 2*unpck + packus
-
- { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
- { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
- };
-
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- static const CostTblEntry SSE1ShuffleTbl[] = {
- { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
- };
+ static const CostTblEntry SSE41ShuffleTbl[] = {
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ };
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- } else if (Kind == TTI::SK_PermuteTwoSrc) {
- // We assume that source and destination have the same vector type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- int NumOfDests = LT.first;
- int NumOfShufflesPerDest = LT.first * 2 - 1;
- int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
-
- static const CostTblEntry AVX512VBMIShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b
- };
-
- if (ST->hasVBMI())
- if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return NumOfShuffles * Entry->Cost;
-
- static const CostTblEntry AVX512BWShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
- {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc
- {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc
- };
-
- if (ST->hasBWI())
- if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return NumOfShuffles * Entry->Cost;
-
- static const CostTblEntry AVX512ShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd
- {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
- {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q
- {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
- {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd
- {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps
- {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q
- {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d
- };
+ static const CostTblEntry SSSE3ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
- return NumOfShuffles * Entry->Cost;
-
- } else if (Kind == TTI::SK_PermuteSingleSrc) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (LT.first == 1) {
-
- static const CostTblEntry AVX512VBMIShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb
- };
-
- if (ST->hasVBMI())
- if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return Entry->Cost;
-
- static const CostTblEntry AVX512BWShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
- {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw
- {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc
- };
-
- if (ST->hasBWI())
- if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return Entry->Cost;
-
- static const CostTblEntry AVX512ShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd
- {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd
- {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
- {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps
- {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq
- {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq
- {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
- {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb
- };
-
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
- return Entry->Cost;
-
- } else {
- // We are going to permute multiple sources and the result will be in
- // multiple destinations. Providing an accurate cost only for splits where
- // the element type remains the same.
-
- MVT LegalVT = LT.second;
- if (LegalVT.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
- LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
-
- unsigned VecTySize = DL.getTypeStoreSize(Tp);
- unsigned LegalVTSize = LegalVT.getStoreSize();
- // Number of source vectors after legalization:
- unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
- // Number of destination vectors after legalization:
- unsigned NumOfDests = LT.first;
-
- Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
- LegalVT.getVectorNumElements());
-
- unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
- return NumOfShuffles *
- getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
- }
- }
- }
+ { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
+
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE2ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
+ { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
+
+ { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
+ { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE1ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
+ };
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
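Every ISA tier above follows the same table-then-lookup shape. A self-contained sketch of what CostTableLookup does (a simplified stand-in for illustration, not LLVM's actual CostTable.h, whose entries use MVT::SimpleValueType):

    #include <cstddef>

    struct CostTblEntry { int ISD; int Type; unsigned Cost; };

    // Linear scan for a matching (shuffle kind / opcode, legalized type) pair.
    // nullptr means "no special lowering known here"; the caller then falls
    // through to the next older ISA table or the base-class estimate.
    static const CostTblEntry *CostTableLookup(const CostTblEntry *Tbl, size_t N,
                                               int ISD, int Ty) {
      for (size_t I = 0; I != N; ++I)
        if (Tbl[I].ISD == ISD && Tbl[I].Type == Ty)
          return &Tbl[I];
      return nullptr;
    }

The split handling earlier in the function reduces to the same arithmetic: for example, a single-source shuffle of v32i16 on AVX2 legalizes to two v16i16 registers (NumOfSrcs == 2, NumOfDests == 2), so it is priced as (2 - 1) * 2 = 2 two-input v16i16 shuffles.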
@@ -1623,17 +1587,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
return Cost+LT.first;
}
-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
// extra micro-ops can significantly decrease throughput.
unsigned NumVectorInstToHideOverhead = 10;
- if (Ty->isVectorTy() && IsComplex)
- return NumVectorInstToHideOverhead;
+ // Cost modeling of strided access computation is hidden by the indexing
+ // modes of X86 regardless of the stride value. We don't believe that there
+ // is a difference between a constant stride in general and a constant
+ // stride whose value is less than or equal to 64.
+ // Even in the case of (loop invariant) stride whose value is not known at
+ // compile time, the address computation will not incur more than one extra
+ // ADD instruction.
+ if (Ty->isVectorTy() && SE) {
+ if (!BaseT::isStridedAccess(Ptr))
+ return NumVectorInstToHideOverhead;
+ if (!BaseT::getConstantStrideStep(SE, Ptr))
+ return 1;
+ }
- return BaseT::getAddressComputationCost(Ty, IsComplex);
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
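In other words (our summary, not patch text), the new hook is a three-way split:

    // Given a vector type and available SCEV info:
    //   non-strided (gather-like) address       -> NumVectorInstToHideOverhead (10)
    //   strided, loop-invariant stride unknown
    //   at compile time                         -> 1 (at most one extra ADD)
    //   strided with a constant stride          -> BaseT cost (x86 addressing
    //                                              modes absorb the computation)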
int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,