about | summary | refs | log | tree | commit | diff
path: root/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp 290
1 files changed, 245 insertions, 45 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5b95c10332dc..b36f8a3d06d0 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
@@ -1223,6 +1224,63 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
+ if (!Mask.empty() && NumOfDests.isValid()) {
+ // Try to perform better estimation of the permutation.
+ // 1. Split the source/destination vectors into real registers.
+ // 2. Do the mask analysis to identify which real registers are
+ // permuted. If more than one source register is used to build the
+ // destination register, the cost for this destination register
+ // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
+ // source register is used, build mask and calculate the cost as a cost
+ // of PermuteSingleSrc.
+ // Also, for the single register permute we try to identify if the
+ // destination register is just a copy of the source register or the
+ // copy of the previous destination register (the cost is
+ // TTI::TCC_Basic). If the source register is just reused, the cost for
+ // this operation is 0.
+ unsigned E = *NumOfDests.getValue();
+ unsigned NormalizedVF =
+ LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
+ unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
+ copy(Mask, NormalizedMask.begin());
+ unsigned PrevSrcReg = 0;
+ ArrayRef<int> PrevRegMask;
+ InstructionCost Cost = 0;
+ processShuffleMasks(
+ NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
+ [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
+ &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
+ if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
+ // Check if the previous register can be just copied to the next
+ // one.
+ if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
+ PrevRegMask != RegMask)
+ Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+ RegMask, 0, nullptr);
+ else
+ // Just a copy of previous destination register.
+ Cost += TTI::TCC_Basic;
+ return;
+ }
+ if (SrcReg != DestReg &&
+ any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
+ // Just a copy of the source register.
+ Cost += TTI::TCC_Basic;
+ }
+ PrevSrcReg = SrcReg;
+ PrevRegMask = RegMask;
+ },
+ [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
+ unsigned /*Unused*/,
+ unsigned /*Unused*/) {
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
+ 0, nullptr);
+ });
+ return Cost;
+ }
+
InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
None, 0, nullptr);
@@ -1545,9 +1603,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
- if (ST->hasSSE2())
+ static const CostTblEntry SSE3BroadcastLoadTbl[] = {
+ {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
+ };
+
+ if (ST->hasSSE2()) {
+ bool IsLoad =
+ llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
+ if (ST->hasSSE3() && IsLoad)
+ if (const auto *Entry =
+ CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
+ assert(isLegalBroadcastLoad(BaseTp->getElementType(),
+ LT.second.getVectorElementCount()) &&
+ "Table entry missing from isLegalBroadcastLoad()");
+ return LT.first * Entry->Cost;
+ }
+
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
+ }
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
@@ -2444,6 +2518,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
std::pair<InstructionCost, MVT> LTDest =
TLI->getTypeLegalizationCost(DL, Dst);
+ // If we're truncating to the same legalized type - just assume it's free.
+ if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
+ return TTI::TCC_Free;
+
if (ST->useAVX512Regs()) {
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(
@@ -2545,7 +2623,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- unsigned ExtraCost = 0;
+ InstructionCost ExtraCost = 0;
if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
// Some vector comparison predicates cost extra instructions.
// TODO: Should we invert this and assume worst case cmp costs
@@ -2619,15 +2697,29 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16f32, 1 },
{ ISD::SELECT, MVT::v8i64, 1 },
+ { ISD::SELECT, MVT::v4i64, 1 },
+ { ISD::SELECT, MVT::v2i64, 1 },
{ ISD::SELECT, MVT::v16i32, 1 },
+ { ISD::SELECT, MVT::v8i32, 1 },
+ { ISD::SELECT, MVT::v4i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
+ { ISD::SELECT, MVT::v4f64, 1 },
+ { ISD::SELECT, MVT::v2f64, 1 },
+ { ISD::SELECT, MVT::f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+ { ISD::SELECT, MVT::v8f32 , 1 },
+ { ISD::SELECT, MVT::v4f32, 1 },
+ { ISD::SELECT, MVT::f32 , 1 },
{ ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
{ ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
- { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
- { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v32i16, 2 },
+ { ISD::SELECT, MVT::v16i16, 1 },
+ { ISD::SELECT, MVT::v8i16, 1 },
+ { ISD::SELECT, MVT::v64i8, 2 },
+ { ISD::SELECT, MVT::v32i8, 1 },
+ { ISD::SELECT, MVT::v16i8, 1 },
};
static const CostTblEntry AVX2CostTbl[] = {
@@ -2636,10 +2728,12 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i16, 1 },
{ ISD::SETCC, MVT::v32i8, 1 },
- { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
- { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
- { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
- { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb
};
static const CostTblEntry AVX1CostTbl[] = {
@@ -2651,49 +2745,54 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i16, 4 },
{ ISD::SETCC, MVT::v32i8, 4 },
- { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
- { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
- { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
- { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd
+ { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps
{ ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
{ ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
};
static const CostTblEntry SSE42CostTbl[] = {
- { ISD::SETCC, MVT::v2f64, 1 },
- { ISD::SETCC, MVT::v4f32, 1 },
{ ISD::SETCC, MVT::v2i64, 1 },
};
static const CostTblEntry SSE41CostTbl[] = {
- { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
- { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
- { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
- { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
- { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
- { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+
+ { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd
+ { ISD::SELECT, MVT::f64, 2 }, // blendvpd
+ { ISD::SELECT, MVT::v4f32, 2 }, // blendvps
+ { ISD::SELECT, MVT::f32 , 2 }, // blendvps
+ { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb
};
static const CostTblEntry SSE2CostTbl[] = {
{ ISD::SETCC, MVT::v2f64, 2 },
{ ISD::SETCC, MVT::f64, 1 },
- { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion
{ ISD::SETCC, MVT::v4i32, 1 },
{ ISD::SETCC, MVT::v8i16, 1 },
{ ISD::SETCC, MVT::v16i8, 1 },
- { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
- { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por
};
static const CostTblEntry SSE1CostTbl[] = {
{ ISD::SETCC, MVT::v4f32, 2 },
{ ISD::SETCC, MVT::f32, 1 },
- { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
+ { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
+ { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
};
if (ST->useSLMArithCosts())
@@ -3555,7 +3654,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
assert(Val->isVectorTy() && "This must be a vector type");
Type *ScalarType = Val->getScalarType();
- int RegisterFileMoveCost = 0;
+ InstructionCost RegisterFileMoveCost = 0;
// Non-immediate extraction/insertion can be handled as a sequence of
// aliased loads+stores via the stack.
@@ -3589,6 +3688,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
+ // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
+ if (Opcode == Instruction::ExtractElement &&
+ ScalarType->getScalarSizeInBits() == 1 &&
+ cast<FixedVectorType>(Val)->getNumElements() > 1)
+ return 1;
+
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -3597,15 +3702,16 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return 0;
// The type may be split. Normalize the index to the new type.
+ unsigned SizeInBits = LT.second.getSizeInBits();
unsigned NumElts = LT.second.getVectorNumElements();
unsigned SubNumElts = NumElts;
Index = Index % NumElts;
// For >128-bit vectors, we need to extract higher 128-bit subvectors.
// For inserts, we also need to insert the subvector back.
- if (LT.second.getSizeInBits() > 128) {
- assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
- unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ if (SizeInBits > 128) {
+ assert((SizeInBits % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = SizeInBits / 128;
SubNumElts = NumElts / NumSubVecs;
if (SubNumElts <= Index) {
RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
@@ -3673,20 +3779,25 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert,
bool Extract) {
+ assert(DemandedElts.getBitWidth() ==
+ cast<FixedVectorType>(Ty)->getNumElements() &&
+ "Vector size mismatch");
+
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+ unsigned SizeInBits = LT.second.getSizeInBits();
+
InstructionCost Cost = 0;
// For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
- std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- MVT MScalarTy = LT.second.getScalarType();
-
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
(MScalarTy.isInteger() && ST->hasSSE41()) ||
(MScalarTy == MVT::f32 && ST->hasSSE41())) {
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
- if (LT.second.getSizeInBits() <= 128) {
+ if (SizeInBits <= 128) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
@@ -3704,9 +3815,9 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
// Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
const int CostValue = *LT.first.getValue();
assert(CostValue >= 0 && "Negative cost!");
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
+ unsigned Num128Lanes = SizeInBits / 128 * CostValue;
unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
- APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+ APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
unsigned Scale = NumElts / Num128Lanes;
// We iterate each 128-lane, and check if we need a
// extracti128/inserti128 for this 128-lane.
@@ -3747,10 +3858,59 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
}
}
- // TODO: Use default extraction for now, but we should investigate extending this
- // to handle repeated subvector extraction.
- if (Extract)
+ if (Extract) {
+ // vXi1 can be efficiently extracted with MOVMSK.
+ // TODO: AVX512 predicate mask handling.
+ // NOTE: This doesn't work well for roundtrip scalarization.
+ if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
+ unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
+ return MOVMSKCost;
+ }
+
+ if (LT.second.isVector()) {
+ int CostValue = *LT.first.getValue();
+ assert(CostValue >= 0 && "Negative cost!");
+
+ unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
+ assert(NumElts >= DemandedElts.getBitWidth() &&
+ "Vector has been legalized to smaller element count");
+
+ // If we're extracting elements from a 128-bit subvector lane, we only need
+ // to extract each lane once, not for every element.
+ if (SizeInBits > 128) {
+ assert((SizeInBits % 128) == 0 && "Illegal vector");
+ unsigned NumLegal128Lanes = SizeInBits / 128;
+ unsigned Num128Lanes = NumLegal128Lanes * CostValue;
+ APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
+ unsigned Scale = NumElts / Num128Lanes;
+
+ // Add cost for each demanded 128-bit subvector extraction.
+ // Luckily this is a lot easier than for insertion.
+ APInt DemandedUpper128Lanes =
+ APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
+ auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
+ for (unsigned I = 0; I != Num128Lanes; ++I)
+ if (DemandedUpper128Lanes[I])
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+ I * Scale, Ty128);
+
+ // Add all the demanded element extractions together, but adjust the
+ // index to use the equivalent of the bottom 128 bit lane.
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (WidenedDemandedElts[I]) {
+ unsigned Idx = I % Scale;
+ Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
+ }
+
+ return Cost;
+ }
+ }
+
+ // Fall back to default extraction.
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+ }
return Cost;
}
@@ -3855,8 +4015,7 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
// if all elements that will form a single Dst vector aren't demanded,
// then we won't need to do that shuffle, so adjust the cost accordingly.
APInt DemandedDstVectors = APIntOps::ScaleBitMask(
- DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
- NumDstVectors);
+ DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
InstructionCost SingleShuffleCost =
@@ -5029,8 +5188,8 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2) {
+bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) {
// X86 specific here are "instruction number 1st priority".
return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
C1.NumIVMuls, C1.NumBaseAdds,
@@ -5110,6 +5269,14 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
return true;
}
+bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
+ ElementCount NumElements) const {
+ // Returns true only for a fixed 2 x double vector on an SSE3 subtarget (movddup).
+ return ST->hasSSE3() && !NumElements.isScalable() && // fixed-width vectors only
+ NumElements.getFixedValue() == 2 && // exactly two elements
+ ElementTy == Type::getDoubleTy(ElementTy->getContext()); // element type must be double
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
@@ -5174,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
return IntWidth == 32 || IntWidth == 64;
}
+bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask) const { // True if the per-lane Opcode0/Opcode1 mix forms an addsub pattern.
+ // ADDSUBPS 4xf32 SSE3
+ // VADDSUBPS 4xf32 AVX
+ // VADDSUBPS 8xf32 AVX2
+ // ADDSUBPD 2xf64 SSE3
+ // VADDSUBPD 2xf64 AVX
+ // VADDSUBPD 4xf64 AVX2
+
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+ assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
+ if (!isPowerOf2_32(NumElements)) // reject non-power-of-two element counts
+ return false;
+ // Check the opcode pattern. We apply the mask on the opcode arguments and
+ // then check if it is what we expect.
+ for (int Lane : seq<int>(0, NumElements)) {
+ unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; // a set mask bit selects Opcode1 for this lane
+ // We expect FSub for even lanes and FAdd for odd lanes.
+ if (Lane % 2 == 0 && Opc != Instruction::FSub)
+ return false;
+ if (Lane % 2 == 1 && Opc != Instruction::FAdd)
+ return false;
+ }
+ // Now check that the pattern is supported by the target ISA. NOTE(review): only SSE3 is tested here although the table above lists AVX/AVX2 forms - presumably wider vectors are split during legalization; confirm.
+ Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
+ if (ElemTy->isFloatTy())
+ return ST->hasSSE3() && NumElements % 4 == 0; // float: SSE3 and a multiple of 4 elements
+ if (ElemTy->isDoubleTy())
+ return ST->hasSSE3() && NumElements % 2 == 0; // double: SSE3 and a multiple of 2 elements
+ return false; // any other element type is rejected
+}
+
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())