Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 290 |
1 file changed, 245 insertions, 45 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5b95c10332dc..b36f8a3d06d0 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *BaseTp,
                                            ArrayRef<int> Mask, int Index,
-                                           VectorType *SubTp) {
+                                           VectorType *SubTp,
+                                           ArrayRef<const Value *> Args) {
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
@@ -1223,6 +1224,63 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                               LegalVT.getVectorNumElements());
 
+      if (!Mask.empty() && NumOfDests.isValid()) {
+        // Try to perform better estimation of the permutation.
+        // 1. Split the source/destination vectors into real registers.
+        // 2. Do the mask analysis to identify which real registers are
+        //    permuted. If more than 1 source registers are used for the
+        //    destination register building, the cost for this destination
+        //    register is (Number_of_source_register - 1) * Cost_PermuteTwoSrc.
+        //    If only one source register is used, build mask and calculate
+        //    the cost as a cost of PermuteSingleSrc.
+        //    Also, for the single register permute we try to identify if the
+        //    destination register is just a copy of the source register or
+        //    the copy of the previous destination register (the cost is
+        //    TTI::TCC_Basic). If the source register is just reused, the
+        //    cost for this operation is 0.
+        unsigned E = *NumOfDests.getValue();
+        unsigned NormalizedVF =
+            LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
+        unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
+        unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
+        SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
+        copy(Mask, NormalizedMask.begin());
+        unsigned PrevSrcReg = 0;
+        ArrayRef<int> PrevRegMask;
+        InstructionCost Cost = 0;
+        processShuffleMasks(
+            NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
+            []() {},
+            [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
+             &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
+              if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
+                // Check if the previous register can be just copied to the
+                // next one.
+                if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
+                    PrevRegMask != RegMask)
+                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+                                         RegMask, 0, nullptr);
+                else
+                  // Just a copy of previous destination register.
+                  Cost += TTI::TCC_Basic;
+                return;
+              }
+              if (SrcReg != DestReg &&
+                  any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
+                // Just a copy of the source register.
+                Cost += TTI::TCC_Basic;
+              }
+              PrevSrcReg = SrcReg;
+              PrevRegMask = RegMask;
+            },
+            [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
+                                      unsigned /*Unused*/,
+                                      unsigned /*Unused*/) {
+              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                     RegMask, 0, nullptr);
+            });
+        return Cost;
+      }
+
       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc,
                                             SingleOpTy, None, 0, nullptr);
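
Note (not part of the patch): the per-register estimation added above is easier to see outside of LLVM. Below is a minimal standalone sketch in plain C++, not the LLVM API; it folds the patch's copy/reuse special cases (TTI::TCC_Basic) into "identity in place is free", so it is coarser than the real analysis, and estimatePermuteCost is a hypothetical name.

    // Split a wide shuffle mask into per-destination-register sub-masks of
    // RegVF lanes and count distinct source registers feeding each one:
    // N > 1 sources -> (N - 1) two-source permutes; one source with a
    // non-identity mask -> one single-source permute; identity/undef -> free.
    #include <cstdio>
    #include <set>
    #include <vector>

    static unsigned estimatePermuteCost(const std::vector<int> &Mask,
                                        unsigned RegVF, unsigned CostSingle,
                                        unsigned CostTwoSrc) {
      unsigned Cost = 0;
      for (unsigned DstReg = 0; DstReg * RegVF < Mask.size(); ++DstReg) {
        std::set<int> SrcRegs;
        bool Identity = true;
        for (unsigned I = 0; I != RegVF; ++I) {
          int M = Mask[DstReg * RegVF + I];
          if (M < 0)
            continue; // undef lane (stand-in for UndefMaskElem), ignore
          SrcRegs.insert(M / (int)RegVF);
          Identity &= (unsigned)M == DstReg * RegVF + I; // lane stays put
        }
        if (SrcRegs.size() > 1)
          Cost += (SrcRegs.size() - 1) * CostTwoSrc; // needs two-src permutes
        else if (!SrcRegs.empty() && !Identity)
          Cost += CostSingle; // one source register, non-identity mask
        // register reused in place, or all lanes undef: free
      }
      return Cost;
    }

    int main() {
      // v8 shuffle over 4-lane registers: dest reg 0 interleaves source
      // regs 0 and 1 (one two-src permute); dest reg 1 is an identity.
      std::vector<int> Mask = {0, 4, 1, 5, /*identity:*/ 4, 5, 6, 7};
      std::printf("estimated cost: %u\n", estimatePermuteCost(Mask, 4, 1, 1));
    }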
@@ -1545,9 +1603,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     { TTI::SK_PermuteTwoSrc,   MVT::v16i8, 13 }, // blend+permute
   };
 
-  if (ST->hasSSE2())
+  static const CostTblEntry SSE3BroadcastLoadTbl[] = {
+      {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
+  };
+
+  if (ST->hasSSE2()) {
+    bool IsLoad =
+        llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
+    if (ST->hasSSE3() && IsLoad)
+      if (const auto *Entry =
+              CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
+        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
+                                    LT.second.getVectorElementCount()) &&
+               "Table entry missing from isLegalBroadcastLoad()");
+        return LT.first * Entry->Cost;
+      }
+
     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
       return LT.first * Entry->Cost;
+  }
 
   static const CostTblEntry SSE1ShuffleTbl[] = {
     { TTI::SK_Broadcast,        MVT::v4f32, 1 }, // shufps
@@ -2444,6 +2518,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   std::pair<InstructionCost, MVT> LTDest =
       TLI->getTypeLegalizationCost(DL, Dst);
 
+  // If we're truncating to the same legalized type - just assume its free.
+  if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
+    return TTI::TCC_Free;
+
   if (ST->useAVX512Regs()) {
     if (ST->hasBWI())
       if (const auto *Entry = ConvertCostTableLookup(
@@ -2545,7 +2623,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  unsigned ExtraCost = 0;
+  InstructionCost ExtraCost = 0;
   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
     // Some vector comparison predicates cost extra instructions.
     // TODO: Should we invert this and assume worst case cmp costs
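
Note (not part of the patch): the zero-cost SSE3BroadcastLoadTbl entry above reflects that a v2f64 broadcast fed by a load maps to a single MOVDDUP with a memory operand, so the shuffle itself costs nothing on top of the load. An illustrative intrinsics snippet, assuming SSE3 and <pmmintrin.h>:

    #include <pmmintrin.h>

    // Load one double and splat it into both lanes; compiles to a single
    // load-folded movddup, e.g. "movddup (%rdi), %xmm0" on x86-64 SysV.
    __m128d broadcast_load(const double *P) {
      return _mm_loaddup_pd(P);
    }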
@@ -2619,15 +2697,29 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16f32,  1 },
 
     { ISD::SELECT,  MVT::v8i64,   1 },
+    { ISD::SELECT,  MVT::v4i64,   1 },
+    { ISD::SELECT,  MVT::v2i64,   1 },
     { ISD::SELECT,  MVT::v16i32,  1 },
+    { ISD::SELECT,  MVT::v8i32,   1 },
+    { ISD::SELECT,  MVT::v4i32,   1 },
     { ISD::SELECT,  MVT::v8f64,   1 },
+    { ISD::SELECT,  MVT::v4f64,   1 },
+    { ISD::SELECT,  MVT::v2f64,   1 },
+    { ISD::SELECT,  MVT::f64,     1 },
     { ISD::SELECT,  MVT::v16f32,  1 },
+    { ISD::SELECT,  MVT::v8f32,   1 },
+    { ISD::SELECT,  MVT::v4f32,   1 },
+    { ISD::SELECT,  MVT::f32,     1 },
 
     { ISD::SETCC,   MVT::v32i16,  2 }, // FIXME: should probably be 4
     { ISD::SETCC,   MVT::v64i8,   2 }, // FIXME: should probably be 4
 
-    { ISD::SELECT,  MVT::v32i16,  2 }, // FIXME: should be 3
-    { ISD::SELECT,  MVT::v64i8,   2 }, // FIXME: should be 3
+    { ISD::SELECT,  MVT::v32i16,  2 },
+    { ISD::SELECT,  MVT::v16i16,  1 },
+    { ISD::SELECT,  MVT::v8i16,   1 },
+    { ISD::SELECT,  MVT::v64i8,   2 },
+    { ISD::SELECT,  MVT::v32i8,   1 },
+    { ISD::SELECT,  MVT::v16i8,   1 },
   };
 
   static const CostTblEntry AVX2CostTbl[] = {
@@ -2636,10 +2728,12 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16i16,  1 },
     { ISD::SETCC,   MVT::v32i8,   1 },
 
-    { ISD::SELECT,  MVT::v4i64,   1 }, // pblendvb
-    { ISD::SELECT,  MVT::v8i32,   1 }, // pblendvb
-    { ISD::SELECT,  MVT::v16i16,  1 }, // pblendvb
-    { ISD::SELECT,  MVT::v32i8,   1 }, // pblendvb
+    { ISD::SELECT,  MVT::v4f64,   2 }, // vblendvpd
+    { ISD::SELECT,  MVT::v8f32,   2 }, // vblendvps
+    { ISD::SELECT,  MVT::v4i64,   2 }, // pblendvb
+    { ISD::SELECT,  MVT::v8i32,   2 }, // pblendvb
+    { ISD::SELECT,  MVT::v16i16,  2 }, // pblendvb
+    { ISD::SELECT,  MVT::v32i8,   2 }, // pblendvb
   };
 
   static const CostTblEntry AVX1CostTbl[] = {
@@ -2651,49 +2745,54 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16i16,  4 },
     { ISD::SETCC,   MVT::v32i8,   4 },
 
-    { ISD::SELECT,  MVT::v4f64,   1 }, // vblendvpd
-    { ISD::SELECT,  MVT::v8f32,   1 }, // vblendvps
-    { ISD::SELECT,  MVT::v4i64,   1 }, // vblendvpd
-    { ISD::SELECT,  MVT::v8i32,   1 }, // vblendvps
+    { ISD::SELECT,  MVT::v4f64,   3 }, // vblendvpd
+    { ISD::SELECT,  MVT::v8f32,   3 }, // vblendvps
+    { ISD::SELECT,  MVT::v4i64,   3 }, // vblendvpd
+    { ISD::SELECT,  MVT::v8i32,   3 }, // vblendvps
     { ISD::SELECT,  MVT::v16i16,  3 }, // vandps + vandnps + vorps
     { ISD::SELECT,  MVT::v32i8,   3 }, // vandps + vandnps + vorps
   };
 
   static const CostTblEntry SSE42CostTbl[] = {
-    { ISD::SETCC,   MVT::v2f64,   1 },
-    { ISD::SETCC,   MVT::v4f32,   1 },
     { ISD::SETCC,   MVT::v2i64,   1 },
   };
 
   static const CostTblEntry SSE41CostTbl[] = {
-    { ISD::SELECT,  MVT::v2f64,   1 }, // blendvpd
-    { ISD::SELECT,  MVT::v4f32,   1 }, // blendvps
-    { ISD::SELECT,  MVT::v2i64,   1 }, // pblendvb
-    { ISD::SELECT,  MVT::v4i32,   1 }, // pblendvb
-    { ISD::SELECT,  MVT::v8i16,   1 }, // pblendvb
-    { ISD::SELECT,  MVT::v16i8,   1 }, // pblendvb
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+
+    { ISD::SELECT,  MVT::v2f64,   2 }, // blendvpd
+    { ISD::SELECT,  MVT::f64,     2 }, // blendvpd
+    { ISD::SELECT,  MVT::v4f32,   2 }, // blendvps
+    { ISD::SELECT,  MVT::f32,     2 }, // blendvps
+    { ISD::SELECT,  MVT::v2i64,   2 }, // pblendvb
+    { ISD::SELECT,  MVT::v4i32,   2 }, // pblendvb
+    { ISD::SELECT,  MVT::v8i16,   2 }, // pblendvb
+    { ISD::SELECT,  MVT::v16i8,   2 }, // pblendvb
   };
 
   static const CostTblEntry SSE2CostTbl[] = {
     { ISD::SETCC,   MVT::v2f64,   2 },
     { ISD::SETCC,   MVT::f64,     1 },
-    { ISD::SETCC,   MVT::v2i64,   8 },
+    { ISD::SETCC,   MVT::v2i64,   5 }, // pcmpeqd/pcmpgtd expansion
     { ISD::SETCC,   MVT::v4i32,   1 },
     { ISD::SETCC,   MVT::v8i16,   1 },
     { ISD::SETCC,   MVT::v16i8,   1 },
 
-    { ISD::SELECT,  MVT::v2f64,   3 }, // andpd + andnpd + orpd
-    { ISD::SELECT,  MVT::v2i64,   3 }, // pand + pandn + por
-    { ISD::SELECT,  MVT::v4i32,   3 }, // pand + pandn + por
-    { ISD::SELECT,  MVT::v8i16,   3 }, // pand + pandn + por
-    { ISD::SELECT,  MVT::v16i8,   3 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v2f64,   2 }, // andpd + andnpd + orpd
+    { ISD::SELECT,  MVT::f64,     2 }, // andpd + andnpd + orpd
+    { ISD::SELECT,  MVT::v2i64,   2 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v4i32,   2 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v8i16,   2 }, // pand + pandn + por
+    { ISD::SELECT,  MVT::v16i8,   2 }, // pand + pandn + por
  };
 
   static const CostTblEntry SSE1CostTbl[] = {
     { ISD::SETCC,   MVT::v4f32,   2 },
     { ISD::SETCC,   MVT::f32,     1 },
 
-    { ISD::SELECT,  MVT::v4f32,   3 }, // andps + andnps + orps
+    { ISD::SELECT,  MVT::v4f32,   2 }, // andps + andnps + orps
+    { ISD::SELECT,  MVT::f32,     2 }, // andps + andnps + orps
   };
 
   if (ST->useSLMArithCosts())
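
Note (not part of the patch): the retuned SELECT entries correspond to the two classic x86 vector-select lowerings. A short intrinsics illustration (assumes SSE2/SSE4.1 headers; each Cond lane must be all-ones or all-zero, as a compare produces):

    #include <smmintrin.h>

    // SSE2: select via and/andnot/or -- the "pand + pandn + por" entries.
    __m128i select_sse2(__m128i Cond, __m128i A, __m128i B) {
      return _mm_or_si128(_mm_and_si128(Cond, A),
                          _mm_andnot_si128(Cond, B)); // (~Cond) & B
    }

    // SSE4.1: a single variable blend -- the "pblendvb" entries.
    __m128i select_sse41(__m128i Cond, __m128i A, __m128i B) {
      // Picks the A byte wherever the corresponding Cond byte's high bit
      // is set, otherwise the B byte.
      return _mm_blendv_epi8(B, A, Cond);
    }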
@@ -3555,7 +3654,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   assert(Val->isVectorTy() && "This must be a vector type");
   Type *ScalarType = Val->getScalarType();
-  int RegisterFileMoveCost = 0;
+  InstructionCost RegisterFileMoveCost = 0;
 
   // Non-immediate extraction/insertion can be handled as a sequence of
   // aliased loads+stores via the stack.
@@ -3589,6 +3688,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 
   if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                        Opcode == Instruction::InsertElement)) {
+    // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
+    if (Opcode == Instruction::ExtractElement &&
+        ScalarType->getScalarSizeInBits() == 1 &&
+        cast<FixedVectorType>(Val)->getNumElements() > 1)
+      return 1;
+
     // Legalize the type.
     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -3597,15 +3702,16 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       return 0;
 
     // The type may be split. Normalize the index to the new type.
+    unsigned SizeInBits = LT.second.getSizeInBits();
     unsigned NumElts = LT.second.getVectorNumElements();
     unsigned SubNumElts = NumElts;
     Index = Index % NumElts;
 
     // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
-    if (LT.second.getSizeInBits() > 128) {
-      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
-      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+    if (SizeInBits > 128) {
+      assert((SizeInBits % 128) == 0 && "Illegal vector");
+      unsigned NumSubVecs = SizeInBits / 128;
       SubNumElts = NumElts / NumSubVecs;
       if (SubNumElts <= Index) {
         RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
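
Note (not part of the patch): the new vXi1 early-out above models the MOVMSK idiom, where a vector of compare results is compressed into a GPR bitmask and a single lane is then a scalar bit test. An illustrative intrinsics snippet, assuming SSE:

    #include <emmintrin.h>

    // Extract one boolean lane of a v4f32 compare: movmskps moves the four
    // sign bits into a GPR, then a shift+mask reads the requested lane.
    bool extract_bool_lane(__m128 A, __m128 B, int Lane) {
      __m128 Cmp = _mm_cmplt_ps(A, B); // all-ones/all-zero per lane
      int Bits = _mm_movemask_ps(Cmp); // 4 sign bits -> integer register
      return (Bits >> Lane) & 1;       // cheap scalar extract
    }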
@@ -3673,20 +3779,25 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
                                                      const APInt &DemandedElts,
                                                      bool Insert,
                                                      bool Extract) {
+  assert(DemandedElts.getBitWidth() ==
+             cast<FixedVectorType>(Ty)->getNumElements() &&
+         "Vector size mismatch");
+
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  MVT MScalarTy = LT.second.getScalarType();
+  unsigned SizeInBits = LT.second.getSizeInBits();
+
   InstructionCost Cost = 0;
 
   // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
   // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
   if (Insert) {
-    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-    MVT MScalarTy = LT.second.getScalarType();
-
     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
         (MScalarTy.isInteger() && ST->hasSSE41()) ||
         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
       // For types we can insert directly, insertion into 128-bit sub vectors is
       // cheap, followed by a cheap chain of concatenations.
-      if (LT.second.getSizeInBits() <= 128) {
+      if (SizeInBits <= 128) {
         Cost +=
             BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
       } else {
@@ -3704,9 +3815,9 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
         // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
         const int CostValue = *LT.first.getValue();
         assert(CostValue >= 0 && "Negative cost!");
-        unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
+        unsigned Num128Lanes = SizeInBits / 128 * CostValue;
         unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
-        APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+        APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
         unsigned Scale = NumElts / Num128Lanes;
         // We iterate each 128-lane, and check if we need a
         // extracti128/inserti128 for this 128-lane.
@@ -3747,10 +3858,59 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
     }
   }
 
-  // TODO: Use default extraction for now, but we should investigate extending this
-  // to handle repeated subvector extraction.
-  if (Extract)
+  if (Extract) {
+    // vXi1 can be efficiently extracted with MOVMSK.
+    // TODO: AVX512 predicate mask handling.
+    // NOTE: This doesn't work well for roundtrip scalarization.
+    if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
+      unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+      unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
+      unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
+      return MOVMSKCost;
+    }
+
+    if (LT.second.isVector()) {
+      int CostValue = *LT.first.getValue();
+      assert(CostValue >= 0 && "Negative cost!");
+
+      unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
+      assert(NumElts >= DemandedElts.getBitWidth() &&
+             "Vector has been legalized to smaller element count");
+
+      // If we're extracting elements from a 128-bit subvector lane, we only
+      // need to extract each lane once, not for every element.
+      if (SizeInBits > 128) {
+        assert((SizeInBits % 128) == 0 && "Illegal vector");
+        unsigned NumLegal128Lanes = SizeInBits / 128;
+        unsigned Num128Lanes = NumLegal128Lanes * CostValue;
+        APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
+        unsigned Scale = NumElts / Num128Lanes;
+
+        // Add cost for each demanded 128-bit subvector extraction.
+        // Luckily this is a lot easier than for insertion.
+        APInt DemandedUpper128Lanes =
+            APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
+        auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
+        for (unsigned I = 0; I != Num128Lanes; ++I)
+          if (DemandedUpper128Lanes[I])
+            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+                                   I * Scale, Ty128);
+
+        // Add all the demanded element extractions together, but adjust the
+        // index to use the equivalent of the bottom 128 bit lane.
+        for (unsigned I = 0; I != NumElts; ++I)
+          if (WidenedDemandedElts[I]) {
+            unsigned Idx = I % Scale;
+            Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
+          }
+
+        return Cost;
+      }
+    }
+
+    // Fallback to default extraction.
     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+  }
 
   return Cost;
 }
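
Note (not part of the patch): the demanded-lane extraction costing above can be sketched standalone. This is plain C++, not the LLVM API; ExtractLaneCost/ExtractEltCost are illustrative stand-ins for the getShuffleCost/getVectorInstrCost queries, and lane 0 is treated as free the way a zero-index subvector extract typically is.

    // Charge one subvector extract per demanded upper 128-bit lane, plus
    // one per-element extract (indexed within the bottom lane) for every
    // demanded element.
    #include <cstdio>
    #include <vector>

    static unsigned extractionCost(const std::vector<bool> &DemandedElts,
                                   unsigned EltsPer128BitLane,
                                   unsigned ExtractLaneCost,
                                   unsigned ExtractEltCost) {
      unsigned Cost = 0;
      unsigned NumLanes =
          (DemandedElts.size() + EltsPer128BitLane - 1) / EltsPer128BitLane;
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        bool LaneDemanded = false;
        for (unsigned I = 0; I != EltsPer128BitLane; ++I) {
          unsigned Idx = Lane * EltsPer128BitLane + I;
          if (Idx < DemandedElts.size() && DemandedElts[Idx]) {
            LaneDemanded = true;
            Cost += ExtractEltCost; // extract from the (copied) bottom lane
          }
        }
        if (LaneDemanded && Lane != 0)
          Cost += ExtractLaneCost; // e.g. one vextracti128 per upper lane
      }
      return Cost;
    }

    int main() {
      // v8i32 (two 128-bit lanes), demanding elements 1 and 6:
      // two element extracts plus one upper-lane extract.
      std::vector<bool> Demanded = {false, true,  false, false,
                                    false, false, true,  false};
      std::printf("cost: %u\n", extractionCost(Demanded, 4, 1, 1));
    }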
@@ -3855,8 +4015,7 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
   // if all elements that will form a single Dst vector aren't demanded,
   // then we won't need to do that shuffle, so adjust the cost accordingly.
   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
-      DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
-      NumDstVectors);
+      DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
   unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
 
   InstructionCost SingleShuffleCost =
@@ -5029,8 +5188,8 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
 
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                               TargetTransformInfo::LSRCost &C2) {
+bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                               const TargetTransformInfo::LSRCost &C2) {
   // X86 specific here are "instruction number 1st priority".
   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                   C1.NumIVMuls, C1.NumBaseAdds,
@@ -5110,6 +5269,14 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
   return true;
 }
 
+bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
+                                      ElementCount NumElements) const {
+  // movddup
+  return ST->hasSSE3() && !NumElements.isScalable() &&
+         NumElements.getFixedValue() == 2 &&
+         ElementTy == Type::getDoubleTy(ElementTy->getContext());
+}
+
 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
   if (!isa<VectorType>(DataTy))
     return false;
@@ -5174,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   return IntWidth == 32 || IntWidth == 64;
 }
 
+bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
+                                 unsigned Opcode1,
+                                 const SmallBitVector &OpcodeMask) const {
+  // ADDSUBPS  4xf32 SSE3
+  // VADDSUBPS 4xf32 AVX
+  // VADDSUBPS 8xf32 AVX2
+  // ADDSUBPD  2xf64 SSE3
+  // VADDSUBPD 2xf64 AVX
+  // VADDSUBPD 4xf64 AVX2
+
+  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+  assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
+  if (!isPowerOf2_32(NumElements))
+    return false;
+  // Check the opcode pattern. We apply the mask on the opcode arguments and
+  // then check if it is what we expect.
+  for (int Lane : seq<int>(0, NumElements)) {
+    unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
+    // We expect FSub for even lanes and FAdd for odd lanes.
+    if (Lane % 2 == 0 && Opc != Instruction::FSub)
+      return false;
+    if (Lane % 2 == 1 && Opc != Instruction::FAdd)
+      return false;
+  }
+  // Now check that the pattern is supported by the target ISA.
+  Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
+  if (ElemTy->isFloatTy())
+    return ST->hasSSE3() && NumElements % 4 == 0;
+  if (ElemTy->isDoubleTy())
+    return ST->hasSSE3() && NumElements % 2 == 0;
+  return false;
+}
+
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
   // AVX2 doesn't support scatter
   if (!ST->hasAVX512())
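
Note (not part of the patch): the alternating FSub/FAdd pattern that isLegalAltInstr() accepts above is exactly what the ADDSUB instructions compute -- subtract in even lanes, add in odd lanes. An illustrative intrinsics snippet, assuming SSE3:

    #include <pmmintrin.h>

    // Compiles to a single addsubps: lanes 0 and 2 get A - B,
    // lanes 1 and 3 get A + B.
    __m128 alt_fsub_fadd(__m128 A, __m128 B) {
      return _mm_addsub_ps(A, B);
    }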
