Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 313
1 file changed, 207 insertions(+), 106 deletions(-)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 223eed3048db..bae2ef80c365 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -130,12 +130,13 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
 }
 
 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
   if (Vector) {
-    if (ST->hasAVX512())
+    if (ST->hasAVX512() && PreferVectorWidth >= 512)
       return 512;
-    if (ST->hasAVX())
+    if (ST->hasAVX() && PreferVectorWidth >= 256)
       return 256;
-    if (ST->hasSSE1())
+    if (ST->hasSSE1() && PreferVectorWidth >= 128)
       return 128;
 
     return 0;
   }
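The first hunk makes the advertised vector register width respect the subtarget's preferred vector width (e.g. as set via -mprefer-vector-width=256), so a CPU with AVX-512 that prefers 256-bit operation now reports 256 rather than 512. That width feeds directly into how wide the vectorizers will go. A minimal sketch of the downstream use, with a hypothetical helper for illustration (maxVF is not an LLVM API):

    // How a client such as a vectorizer might turn the reported register
    // width into a maximum vectorization factor (illustrative helper only).
    unsigned maxVF(unsigned RegisterBitWidth, unsigned ElementBits) {
      if (RegisterBitWidth == 0 || ElementBits == 0)
        return 1;                            // no vector registers: stay scalar
      return RegisterBitWidth / ElementBits; // e.g. 256 / 32 -> VF of 8
    }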
@@ -180,28 +181,40 @@ int X86TTIImpl::getArithmeticInstrCost(
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  static const CostTblEntry GLMCostTable[] = {
+    { ISD::FDIV,  MVT::f32,   18 }, // divss
+    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
+    { ISD::FDIV,  MVT::f64,   33 }, // divsd
+    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
+  };
+
+  if (ST->isGLM())
+    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+
   static const CostTblEntry SLMCostTable[] = {
-    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
-    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
-    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
-    { ISD::FMUL, MVT::f64,   2  }, // mulsd
-    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
-    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
-    { ISD::FDIV, MVT::f32,   17 }, // divss
-    { ISD::FDIV, MVT::v4f32, 39 }, // divps
-    { ISD::FDIV, MVT::f64,   32 }, // divsd
-    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
-    { ISD::FADD, MVT::v2f64, 2  }, // addpd
-    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
+    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
+    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
+    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
+    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
+    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
+    { ISD::FDIV,  MVT::f32,   17 }, // divss
+    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
+    { ISD::FDIV,  MVT::f64,   32 }, // divsd
+    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
+    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
+    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
     // v2i64/v4i64 mul is custom lowered as a series of long:
     // multiplies(3), shifts(3) and adds(2)
     // slm muldq version throughput is 2 and addq throughput 4
-    // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
-    { ISD::MUL,  MVT::v2i64, 17 },
+    { ISD::MUL,   MVT::v2i64, 17 },
     // slm addq\subq throughput is 4
-    { ISD::ADD,  MVT::v2i64, 4  },
-    { ISD::SUB,  MVT::v2i64, 4  },
+    { ISD::ADD,   MVT::v2i64, 4  },
+    { ISD::SUB,   MVT::v2i64, 4  },
   };
 
   if (ST->isSLM()) {
@@ -224,30 +237,53 @@ int X86TTIImpl::getArithmeticInstrCost(
       if (!signedMode && OpMinSize <= 16)
         return LT.first * 5; // pmullw/pmulhw/pshuf
     }
+
     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) {
       return LT.first * Entry->Cost;
     }
   }
 
-  if (ISD == ISD::SDIV &&
-      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
+       ISD == ISD::UREM) &&
+      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
-    // On X86, vector signed division by constants power-of-two are
-    // normally expanded to the sequence SRA + SRL + ADD + SRA.
-    // The OperandValue properties many not be same as that of previous
-    // operation;conservatively assume OP_None.
-    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
-                                          Op2Info, TargetTransformInfo::OP_None,
-                                          TargetTransformInfo::OP_None);
-    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
-    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
+    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+      // On X86, vector signed division by constants power-of-two are
+      // normally expanded to the sequence SRA + SRL + ADD + SRA.
+      // The OperandValue properties may not be the same as that of the previous
+      // operation; conservatively assume OP_None.
+      int Cost =
+          2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+
+      if (ISD == ISD::SREM) {
+        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
+        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
+        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+      }
+
+      return Cost;
+    }
 
-    return Cost;
+    // Vector unsigned division/remainder will be simplified to shifts/masks.
+    if (ISD == ISD::UDIV)
+      return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+                                    TargetTransformInfo::OP_None,
+                                    TargetTransformInfo::OP_None);
+
+    if (ISD == ISD::UREM)
+      return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+                                    TargetTransformInfo::OP_None,
+                                    TargetTransformInfo::OP_None);
   }
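The block above prices the power-of-two expansions rather than a real division. A scalar sketch of the per-lane sequences being costed, assuming 32-bit lanes and a divisor of 8 (function names are illustrative only; right shifts of negative values are shown with the arithmetic behavior mainstream compilers implement):

    #include <cstdint>

    int32_t sdiv_pow2(int32_t x) {              // SRA + SRL + ADD + SRA
      int32_t Sign = x >> 31;                   // SRA: all-ones if x < 0
      int32_t Bias = (uint32_t)Sign >> 29;      // SRL: 7 if x < 0, else 0
      return (x + Bias) >> 3;                   // ADD + SRA: round toward zero
    }

    int32_t srem_pow2(int32_t x) {              // X % C == X - (X/C)*C,
      return x - sdiv_pow2(x) * 8;              // hence the extra mul + sub
    }

    uint32_t udiv_pow2(uint32_t x) { return x >> 3; } // a single LShr
    uint32_t urem_pow2(uint32_t x) { return x & 7; }  // a single And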
 
   static const CostTblEntry AVX512BWUniformConstCostTable[] = {
@@ -256,7 +292,9 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
 
     { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
     { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -272,7 +310,9 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v8i64,   1 },
 
     { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
     { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -290,9 +330,13 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
 
     { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
     { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
     { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
     { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -312,13 +356,21 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
 
     { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
     { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
+    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
     { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
     { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
+    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
     { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
+    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
     { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
+    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
     { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
+    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
     { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
+    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -326,8 +378,12 @@ int X86TTIImpl::getArithmeticInstrCost(
     // pmuldq sequence.
     if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
       return LT.first * 32;
+    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
+      return LT.first * 38;
     if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
+    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 20;
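A pattern worth noting in these tables: each new SREM/UREM row is the matching SDIV/UDIV row plus the cost of a multiply and a subtract at that type (+2 where the vector multiply is a single cheap op, more where it is itself expensive), because once the quotient has been computed by the pmulhw/pmuludq sequence the remainder follows from the identity quoted earlier. A scalar statement of that accounting (illustrative only):

    #include <cstdint>

    int16_t rem_from_quotient(int16_t x, int16_t c) {
      int16_t q = x / c; // priced as the SDIV row (e.g. 6 for v32i16)
      return x - q * c;  // + mul (1) + sub (1) gives the SREM row of 8
    }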
 
   // XOP has faster vXi8 shifts.
   if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
@@ -405,12 +461,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::MUL,  MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,  MVT::v32i8,  4 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,  MVT::v16i8,  4 }, // extend/pmullw/trunc sequence.
-
-    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
-    { ISD::SDIV, MVT::v64i8,  64*20 },
-    { ISD::SDIV, MVT::v32i16, 32*20 },
-    { ISD::UDIV, MVT::v64i8,  64*20 },
-    { ISD::UDIV, MVT::v32i16, 32*20 }
   };
 
   // Look for AVX512BW lowering tricks for custom cases.
@@ -432,14 +482,18 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::MUL,  MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,  MVT::v16i8,   5 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld
+    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
+    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
+    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
     { ISD::MUL,  MVT::v8i64,   8 }, // 3*pmuludq/3*shift/2*add
 
-    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
-    { ISD::SDIV, MVT::v16i32, 16*20 },
-    { ISD::SDIV, MVT::v8i64,   8*20 },
-    { ISD::UDIV, MVT::v16i32, 16*20 },
-    { ISD::UDIV, MVT::v8i64,   8*20 }
+    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
+    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
+    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
+
+    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
+    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
+    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
   };
 
   if (ST->hasAVX512())
@@ -468,7 +522,9 @@ int X86TTIImpl::getArithmeticInstrCost(
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
     // On AVX2, a packed v16i16 shift left by a constant build_vector
     // is lowered into a vector multiply (vpmullw).
-    return LT.first;
+    return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+                                  TargetTransformInfo::OP_None,
+                                  TargetTransformInfo::OP_None);
 
   if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
     return LT.first * Entry->Cost;
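The return-value change in the last hunk stops treating a constant vector shift-left as nearly free (LT.first) and instead charges it as a multiply, since that is what it lowers to: shifting left by a constant is multiplying by a power of two, so one vpmullw performs all sixteen i16 shifts at once. The identity in scalar form (illustrative):

    #include <cstdint>

    uint16_t shl_as_mul(uint16_t x, unsigned c) {
      return (uint16_t)(x * (1u << c)); // x << c == x * 2^c, per lane
    }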
@@ -571,9 +627,16 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::MUL,  MVT::v32i8,  17 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,  MVT::v16i8,   7 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
-    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld
+    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
     { ISD::MUL,  MVT::v4i64,   8 }, // 3*pmuludq/3*shift/2*add
 
+    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
+
+    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
     { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
@@ -617,16 +680,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::FDIV, MVT::f64,   22 }, // SNB from http://www.agner.org/
     { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
     { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
-    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
-    { ISD::SDIV, MVT::v32i8,  32*20 },
-    { ISD::SDIV, MVT::v16i16, 16*20 },
-    { ISD::SDIV, MVT::v8i32,   8*20 },
-    { ISD::SDIV, MVT::v4i64,   4*20 },
-    { ISD::UDIV, MVT::v32i8,  32*20 },
-    { ISD::UDIV, MVT::v16i16, 16*20 },
-    { ISD::UDIV, MVT::v8i32,   8*20 },
-    { ISD::UDIV, MVT::v4i64,   4*20 },
   };
 
   if (ST->hasAVX())
@@ -634,6 +687,21 @@ int X86TTIImpl::getArithmeticInstrCost(
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE42CostTable[] = {
+    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
+    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
+    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
+    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/
+
+    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
+    { ISD::FSUB, MVT::f32 ,   1 }, // Nehalem from http://www.agner.org/
+    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
+    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/
+
+    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
+    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
+    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/
+
     { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
     { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
@@ -666,7 +734,7 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v4i32,    12 }, // Shift each lane + blend.
     { ISD::SRA,  MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
 
-    { ISD::MUL,  MVT::v4i32, 1 } // pmulld
+    { ISD::MUL,  MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
   };
 
   if (ST->hasSSE41())
@@ -703,21 +771,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
     { ISD::FDIV, MVT::f64,   38 }, // Pentium IV from http://www.agner.org/
     { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
-
-    // It is not a good idea to vectorize division. We have to scalarize it and
-    // in the process we will often end up having to spilling regular
-    // registers. The overhead of division is going to dominate most kernels
-    // anyways so try hard to prevent vectorization of division - it is
-    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
-    // to hide "20 cycles" for each lane.
-    { ISD::SDIV, MVT::v16i8, 16*20 },
-    { ISD::SDIV, MVT::v8i16,  8*20 },
-    { ISD::SDIV, MVT::v4i32,  4*20 },
-    { ISD::SDIV, MVT::v2i64,  2*20 },
-    { ISD::UDIV, MVT::v16i8, 16*20 },
-    { ISD::UDIV, MVT::v8i16,  8*20 },
-    { ISD::UDIV, MVT::v4i32,  4*20 },
-    { ISD::UDIV, MVT::v2i64,  2*20 },
   };
 
   if (ST->hasSSE2())
@@ -733,6 +786,20 @@ int X86TTIImpl::getArithmeticInstrCost(
   if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
     return LT.first * Entry->Cost;
 
+  // It is not a good idea to vectorize division. We have to scalarize it and
+  // in the process we will often end up having to spilling regular
+  // registers. The overhead of division is going to dominate most kernels
+  // anyways so try hard to prevent vectorization of division - it is
+  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+  // to hide "20 cycles" for each lane.
+  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
+                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
+    int ScalarCost = getArithmeticInstrCost(
+        Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
+  }
+
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
 }
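This generic formula replaces the per-type 20-cycle entries deleted from the AVX512/AVX/SSE2 tables above: cost = 20 x (legalization splits) x (lanes per legal vector) x (scalar op cost). A worked instance, assuming a scalar division cost of 1 purely for illustration:

    // v8i32 SDIV on a target where v8i32 legalizes to 2 x v4i32:
    // LT.first == 2 and each legal piece has 4 lanes.
    int Cost = 20 * /*LT.first=*/2 * /*lanes=*/4 * /*ScalarCost=*/1; // == 160

Under that assumption the formula reproduces the deleted { ISD::SDIV, MVT::v8i32, 8*20 } entry exactly, while also scaling with the real scalar cost where the base implementation reports more than 1.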
@@ -754,7 +821,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // type remains the same.
   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
     MVT LegalVT = LT.second;
-    if (LegalVT.getVectorElementType().getSizeInBits() ==
+    if (LegalVT.isVector() &&
+        LegalVT.getVectorElementType().getSizeInBits() ==
             Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
         LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
 
@@ -886,8 +954,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
     { TTI::SK_Reverse, MVT::v32i8,  2 }, // vperm2i128 + pshufb
 
-    { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
-    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb
+    { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
+    { TTI::SK_Select, MVT::v32i8,  1 }, // vpblendvb
 
     { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
     { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
@@ -951,15 +1019,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse, MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
                                          // + vinsertf128
 
-    { TTI::SK_Alternate, MVT::v4i64,  1 }, // vblendpd
-    { TTI::SK_Alternate, MVT::v4f64,  1 }, // vblendpd
-    { TTI::SK_Alternate, MVT::v8i32,  1 }, // vblendps
-    { TTI::SK_Alternate, MVT::v8f32,  1 }, // vblendps
-    { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
-    { TTI::SK_Alternate, MVT::v32i8,  3 }, // vpand + vpandn + vpor
+    { TTI::SK_Select, MVT::v4i64,  1 }, // vblendpd
+    { TTI::SK_Select, MVT::v4f64,  1 }, // vblendpd
+    { TTI::SK_Select, MVT::v8i32,  1 }, // vblendps
+    { TTI::SK_Select, MVT::v8f32,  1 }, // vblendps
+    { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+    { TTI::SK_Select, MVT::v32i8,  3 }, // vpand + vpandn + vpor
 
-    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
-    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
+    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
+    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
     { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
     { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
     { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
@@ -967,9 +1035,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
 
-    { TTI::SK_PermuteTwoSrc, MVT::v4f64,  4 }, // 2*vperm2f128 + 2*vshufpd
+    { TTI::SK_PermuteTwoSrc, MVT::v4f64,  3 }, // 2*vperm2f128 + vshufpd
+    { TTI::SK_PermuteTwoSrc, MVT::v4i64,  3 }, // 2*vperm2f128 + vshufpd
     { TTI::SK_PermuteTwoSrc, MVT::v8f32,  4 }, // 2*vperm2f128 + 2*vshufps
-    { TTI::SK_PermuteTwoSrc, MVT::v4i64,  4 }, // 2*vperm2f128 + 2*vshufpd
     { TTI::SK_PermuteTwoSrc, MVT::v8i32,  4 }, // 2*vperm2f128 + 2*vshufps
     { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
                                                 // + 4*por + vinsertf128
@@ -982,12 +1050,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE41ShuffleTbl[] = {
-    { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
-    { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
-    { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
-    { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
-    { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
-    { TTI::SK_Alternate, MVT::v16i8, 1 }  // pblendvb
+    { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
+    { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+    { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
+    { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
+    { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
+    { TTI::SK_Select, MVT::v16i8, 1 }  // pblendvb
   };
 
   if (ST->hasSSE41())
@@ -1001,8 +1069,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
     { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
 
-    { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
-    { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
+    { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
+    { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por
 
     { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
@@ -1029,11 +1097,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
                                         // + 2*pshufd + 2*unpck + packus
 
-    { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
-    { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
-    { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
-    { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
-    { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+    { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
+    { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
+    { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
+    { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
+    { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por
 
     { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
     { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
@@ -1057,7 +1125,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   static const CostTblEntry SSE1ShuffleTbl[] = {
     { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
     { TTI::SK_Reverse,          MVT::v4f32, 1 }, // shufps
-    { TTI::SK_Alternate,        MVT::v4f32, 2 }, // 2*shufps
+    { TTI::SK_Select,           MVT::v4f32, 2 }, // 2*shufps
     { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
   };
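The recurring rename in these shuffle tables, SK_Alternate to SK_Select, reflects that the shuffle kind being costed is a general lane-wise selection between two sources (any per-lane choice, hence the blend instructions in the row comments), not just strict even/odd alternation. A scalar model of the shuffle kind being priced (a sketch, not LLVM code):

    #include <array>
    #include <cstddef>

    // Lane I of the result comes from lane I of either A or B, per the mask;
    // this is what pblendw/blendps/vpblendvb implement in one instruction.
    template <std::size_t N>
    std::array<int, N> selectShuffle(const std::array<int, N> &A,
                                     const std::array<int, N> &B,
                                     const std::array<bool, N> &UseB) {
      std::array<int, N> R{};
      for (std::size_t I = 0; I != N; ++I)
        R[I] = UseB[I] ? B[I] : A[I];
      return R;
    }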
@@ -1487,6 +1555,15 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     { ISD::SETCC, MVT::v16f32, 1 },
   };
 
+  static const CostTblEntry AVX512BWCostTbl[] = {
+    { ISD::SETCC, MVT::v32i16, 1 },
+    { ISD::SETCC, MVT::v64i8,  1 },
+  };
+
+  if (ST->hasBWI())
+    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   if (ST->hasAVX512())
     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
@@ -1631,6 +1708,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
   };
+  static const CostTblEntry GLMCostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
+    { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+    { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
+    { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+  };
+  static const CostTblEntry SLMCostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
+    { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+    { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
+    { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+  };
   static const CostTblEntry SSE42CostTbl[] = {
     { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
@@ -1721,6 +1810,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   MVT MTy = LT.second;
 
   // Attempt to lookup cost.
+  if (ST->isGLM())
+    if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->isSLM())
+    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   if (ST->hasCDI())
     if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
@@ -1886,8 +1983,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
       LT.second.getVectorNumElements() == NumElem)
     // Promotion requires expand/truncate for data and a shuffle for mask.
-    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
-            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+    Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
+            getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
 
   else if (LT.second.getVectorNumElements() > NumElem) {
     VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2145,7 +2242,7 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
   return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
 }
 
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
 int X86TTIImpl::getIntImmCost(int64_t Val) {
@@ -2480,6 +2577,10 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
 
+bool X86TTIImpl::canMacroFuseCmp() {
+  return ST->hasMacroFusion();
+}
+
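The new canMacroFuseCmp() hook reports whether the subtarget folds a compare and the following conditional branch into a single micro-op, so cost-driven passes can prefer keeping loop-exit tests in the fusable shape. The pattern that benefits, shown as plain C++ (illustrative; it is the cmp/jcc pair compiled from the exit test that macro-fusion merges):

    int sumUpTo(const int *A, int N) {
      int S = 0;
      for (int I = 0; I != N; ++I) // the I != N compare and the backedge
        S += A[I];                 // branch can fuse into one uop here
      return S;
    }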
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   // The backend can't handle a single element vector.
   if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
@@ -2522,7 +2623,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
   // enable gather with a -march.
   return (DataWidth == 32 || DataWidth == 64) &&
-        (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+         (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
 }
 
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
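For reference, the gather-legality predicate above admits only 32- and 64-bit element types, and only on subtargets with AVX-512, or with AVX2 plus a fast gather implementation; anything else falls back to scalarized loads. Restated as a standalone predicate (a hypothetical helper mirroring the return expression, not LLVM API):

    bool gatherIsLegal(unsigned DataWidth, bool HasAVX512, bool HasAVX2,
                       bool HasFastGather) {
      return (DataWidth == 32 || DataWidth == 64) &&
             (HasAVX512 || (HasFastGather && HasAVX2));
    }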