Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 225 |
1 file changed, 183 insertions, 42 deletions
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3dc59aeb263e..70fd857fcf01 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
 }
 
-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+  bool Vector = (ClassID == 1);
   if (Vector && !ST->hasSSE1())
     return 0;
 
@@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost(
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
-  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
@@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
       int NumSubElts = SubLT.second.getVectorNumElements();
       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
         return SubLT.first;
+      // Handle some cases for widening legalization. For now we only handle
+      // cases where the original subvector was naturally aligned and evenly
+      // fit in its legalized subvector type.
+      // FIXME: Remove some of the alignment restrictions.
+      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
+      // vectors.
+      int OrigSubElts = SubTp->getVectorNumElements();
+      if (NumSubElts > OrigSubElts &&
+          (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+          LT.second.getVectorElementType() ==
+          SubLT.second.getVectorElementType() &&
+          LT.second.getVectorElementType().getSizeInBits() ==
+          Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
+               "Unexpected number of elements!");
+        Type *VecTy = VectorType::get(Tp->getVectorElementType(),
+                                      LT.second.getVectorNumElements());
+        Type *SubTy = VectorType::get(Tp->getVectorElementType(),
+                                      SubLT.second.getVectorNumElements());
+        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
+        int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+                                         ExtractIndex, SubTy);
+
+        // If the original size is 32-bits or more, we can use pshufd. Otherwise
+        // if we have SSSE3 we can use pshufb.
+        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
+          return ExtractCost + 1; // pshufd or pshufb
+
+        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
+               "Unexpected vector size");
+
+        return ExtractCost + 2; // worst case pshufhw + pshufd
+      }
     }
   }
 
@@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
 
@@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
 
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
 
@@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
@@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
     { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
 
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
@@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
 
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
   };
@@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
@@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
 
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
 
     { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
 
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
     { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
     { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+    { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
   };
 
   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
@@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     }
   }
 
+  static const CostTblEntry SLMCostTbl[] = {
+    // slm pcmpeq/pcmpgt throughput is 2
+    { ISD::SETCC, MVT::v2i64, 2 },
+  };
+
   static const CostTblEntry AVX512BWCostTbl[] = {
     { ISD::SETCC, MVT::v32i16, 1 },
     { ISD::SETCC, MVT::v64i8, 1 },
@@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
     { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
   };
 
+  if (ST->isSLM())
+    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+      return LT.first * (ExtraCost + Entry->Cost);
+
   if (ST->hasBWI())
     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
       return LT.first * (ExtraCost + Entry->Cost);
@@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
   };
+  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTLZ, MVT::i64, 1 },
+  };
+  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTLZ, MVT::i32, 1 },
+    { ISD::CTLZ, MVT::i16, 1 },
+    { ISD::CTLZ, MVT::i8, 1 },
+  };
+  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTPOP, MVT::i64, 1 },
+  };
+  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTPOP, MVT::i32, 1 },
+    { ISD::CTPOP, MVT::i16, 1 },
+    { ISD::CTPOP, MVT::i8, 1 },
+  };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTPOP, MVT::i64, 10 },
     { ISD::SADDO, MVT::i64, 1 },
     { ISD::UADDO, MVT::i64, 1 },
   };
@@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8, 11 },
+    { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTPOP, MVT::i32, 8 },
+    { ISD::CTPOP, MVT::i16, 9 },
+    { ISD::CTPOP, MVT::i8, 7 },
     { ISD::SADDO, MVT::i32, 1 },
     { ISD::SADDO, MVT::i16, 1 },
     { ISD::SADDO, MVT::i8, 1 },
@@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
+  if (ST->hasLZCNT()) {
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
+  if (ST->hasPOPCNT()) {
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
+  // TODO - add BMI (TZCNT) scalar handling
+
   if (ST->is64Bit())
     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;
@@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   unsigned NumElem = SrcVTy->getVectorNumElements();
   VectorType *MaskTy =
       VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
-  if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
-      (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
+  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
+      (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+      !isPowerOf2_32(NumElem)) {
     // Scalarization
     int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
     int ScalarCompareCost = getCmpSelInstrCost(
@@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                            bool IsPairwise) {
-
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
-  MVT MTy = LT.second;
-
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
-  static const CostTblEntry SSE42CostTblPairWise[] = {
+  static const CostTblEntry SSE2CostTblPairWise[] = {
     { ISD::FADD, MVT::v2f64, 2 },
     { ISD::FADD, MVT::v4f32, 4 },
     { ISD::ADD, MVT::v2i64, 2 },  // The data reported by the IACA tool is "1.6".
+    { ISD::ADD, MVT::v2i32, 2 },  // FIXME: chosen to be less than v4i32.
     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.5".
+    { ISD::ADD, MVT::v2i16, 3 },  // FIXME: chosen to be less than v4i16
+    { ISD::ADD, MVT::v4i16, 4 },  // FIXME: chosen to be less than v8i16
     { ISD::ADD, MVT::v8i16, 5 },
+    { ISD::ADD, MVT::v2i8, 2 },
+    { ISD::ADD, MVT::v4i8, 2 },
+    { ISD::ADD, MVT::v8i8, 2 },
+    { ISD::ADD, MVT::v16i8, 3 },
   };
 
   static const CostTblEntry AVX1CostTblPairWise[] = {
-    { ISD::FADD, MVT::v4f32, 4 },
     { ISD::FADD, MVT::v4f64, 5 },
     { ISD::FADD, MVT::v8f32, 7 },
     { ISD::ADD, MVT::v2i64, 1 },  // The data reported by the IACA tool is "1.5".
-    { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.5".
     { ISD::ADD, MVT::v4i64, 5 },  // The data reported by the IACA tool is "4.8".
-    { ISD::ADD, MVT::v8i16, 5 },
     { ISD::ADD, MVT::v8i32, 5 },
+    { ISD::ADD, MVT::v16i16, 6 },
+    { ISD::ADD, MVT::v32i8, 4 },
   };
 
-  static const CostTblEntry SSE42CostTblNoPairWise[] = {
+  static const CostTblEntry SSE2CostTblNoPairWise[] = {
     { ISD::FADD, MVT::v2f64, 2 },
     { ISD::FADD, MVT::v4f32, 4 },
     { ISD::ADD, MVT::v2i64, 2 },  // The data reported by the IACA tool is "1.6".
+    { ISD::ADD, MVT::v2i32, 2 },  // FIXME: chosen to be less than v4i32
     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.3".
+    { ISD::ADD, MVT::v2i16, 2 },  // The data reported by the IACA tool is "4.3".
+    { ISD::ADD, MVT::v4i16, 3 },  // The data reported by the IACA tool is "4.3".
     { ISD::ADD, MVT::v8i16, 4 },  // The data reported by the IACA tool is "4.3".
+    { ISD::ADD, MVT::v2i8, 2 },
+    { ISD::ADD, MVT::v4i8, 2 },
+    { ISD::ADD, MVT::v8i8, 2 },
+    { ISD::ADD, MVT::v16i8, 3 },
  };
 
   static const CostTblEntry AVX1CostTblNoPairWise[] = {
-    { ISD::FADD, MVT::v4f32, 3 },
     { ISD::FADD, MVT::v4f64, 3 },
+    { ISD::FADD, MVT::v4f32, 3 },
     { ISD::FADD, MVT::v8f32, 4 },
     { ISD::ADD, MVT::v2i64, 1 },  // The data reported by the IACA tool is "1.5".
-    { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "2.8".
     { ISD::ADD, MVT::v4i64, 3 },
-    { ISD::ADD, MVT::v8i16, 4 },
     { ISD::ADD, MVT::v8i32, 5 },
+    { ISD::ADD, MVT::v16i16, 5 },
+    { ISD::ADD, MVT::v32i8, 4 },
   };
 
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Before legalizing the type, give a chance to look up illegal narrow types
+  // in the table.
+  // FIXME: Is there a better way to do this?
+  EVT VT = TLI->getValueType(DL, ValTy);
+  if (VT.isSimple()) {
+    MVT MTy = VT.getSimpleVT();
+    if (IsPairwise) {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE2())
+        if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+          return Entry->Cost;
+    } else {
+      if (ST->hasAVX())
+        if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+
+      if (ST->hasSSE2())
+        if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+          return Entry->Cost;
+    }
+  }
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
   if (IsPairwise) {
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
 
-    if (ST->hasSSE42())
-      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
   } else {
     if (ST->hasAVX())
       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
         return LT.first * Entry->Cost;
 
-    if (ST->hasSSE42())
-      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+    if (ST->hasSSE2())
+      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
   }
@@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() {
   return ST->hasMacroFusion() || ST->hasBranchFusion();
 }
 
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
   if (!ST->hasAVX())
     return false;
 
@@ -3139,11 +3281,11 @@
          ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
 }
 
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
-  return isLegalMaskedLoad(DataType);
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+  return isLegalMaskedLoad(DataType, Alignment);
 }
 
-bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
   unsigned DataSize = DL.getTypeStoreSize(DataType);
   // The only supported nontemporal loads are for aligned vectors of 16 or 32
   // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
@@ -3154,7 +3296,7 @@
   return false;
 }
 
-bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
   unsigned DataSize = DL.getTypeStoreSize(DataType);
 
   // SSE4A supports nontemporal stores of float and double at arbitrary
@@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   if (IsZeroCmp) {
     // Only enable vector loads for equality comparison. Right now the vector
     // version is not as fast for three way compare (see #33329).
-    // TODO: enable AVX512 when the DAG is ready.
-    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
     const unsigned PreferredWidth = ST->getPreferVectorWidth();
+    if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
     if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
     // All GPR and vector loads can be unaligned. SIMD compare requires integer