author    Dimitry Andric <dim@FreeBSD.org>  2022-07-24 15:03:44 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2022-07-24 15:03:44 +0000
commit    4b4fe385e49bd883fd183b5f21c1ea486c722e61 (patch)
tree      c3d8fdb355c9c73e57723718c22103aaf7d15aa6 /llvm/lib/Target/X86
parent    1f917f69ff07f09b6dbb670971f57f8efe718b84 (diff)
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/X86.td                      |   2
-rw-r--r--  llvm/lib/Target/X86/X86FixupBWInsts.cpp         |  12
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp         | 561
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td           |  48
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp            |  12
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h              |   3
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td              |  70
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp        |   8
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp  |  65
9 files changed, 497 insertions, 284 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a859176220c7..fa0a6bd415dc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1277,7 +1277,7 @@ class ProcModel<string Name, SchedMachineModel Model,
// enabled. It has no effect on code generation.
// NOTE: As a default tuning, "generic" aims to produce code optimized for the
// most common X86 processors. The tunings might be changed over time. It is
-// recommended to use "x86-64" in lit tests for consistency.
+// recommended to use "tune-cpu"="x86-64" in the function attribute for consistency.
def : ProcModel<"generic", SandyBridgeModel,
[FeatureX87, FeatureCX8, FeatureX86_64],
[TuningSlow3OpsLEA,
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 16bff201dd03..db6923416177 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
switch (MI->getOpcode()) {
case X86::MOV8rm:
- // Only replace 8 bit loads with the zero extending versions if
- // in an inner most loop and not optimizing for size. This takes
- // an extra byte to encode, and provides limited performance upside.
- if (MachineLoop *ML = MLI->getLoopFor(&MBB))
- if (ML->begin() == ML->end() && !OptForSize)
- return tryReplaceLoad(X86::MOVZX32rm8, MI);
+ // Replace 8-bit loads with the zero-extending version if not optimizing
+ // for size. The extending op is cheaper across a wide range of uarch and
+ // it avoids a potentially expensive partial register stall. It takes an
+ // extra byte to encode, however, so don't do this when optimizing for size.
+ if (!OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;
case X86::MOV16rm:
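
The hunk above relaxes the rewrite condition: MOV8rm now becomes MOVZX32rm8 whenever the function is not optimizing for size, instead of only inside innermost loops. A minimal standalone sketch of the new decision rule, with illustrative stand-ins for the LLVM opcodes:

    #include <cstdio>

    // Illustrative stand-ins for X86::MOV8rm / X86::MOVZX32rm8.
    enum Opcode { MOV8rm, MOVZX32rm8 };

    // Old rule: innermost loop AND !OptForSize. New rule: the zero-extending
    // load wins whenever size is not critical, because it breaks the
    // dependency on the destination register's old upper bits and so avoids
    // partial-register stalls.
    Opcode chooseLoadOpcode(bool OptForSize) {
      return OptForSize ? MOV8rm : MOVZX32rm8;
    }

    int main() {
      printf("%s\n", chooseLoadOpcode(false) == MOVZX32rm8 ? "movzx" : "mov8");
      return 0;
    }
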
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12af6087cb47..5a4533c4bac4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -555,6 +555,39 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+ setOperationAction(ISD::FABS, VT, Action);
+ setOperationAction(ISD::FNEG, VT, Action);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Action);
+ setOperationAction(ISD::FMA, VT, Action);
+ setOperationAction(ISD::FMINNUM, VT, Action);
+ setOperationAction(ISD::FMAXNUM, VT, Action);
+ setOperationAction(ISD::FMINIMUM, VT, Action);
+ setOperationAction(ISD::FMAXIMUM, VT, Action);
+ setOperationAction(ISD::FSIN, VT, Action);
+ setOperationAction(ISD::FCOS, VT, Action);
+ setOperationAction(ISD::FSINCOS, VT, Action);
+ setOperationAction(ISD::FSQRT, VT, Action);
+ setOperationAction(ISD::FPOW, VT, Action);
+ setOperationAction(ISD::FLOG, VT, Action);
+ setOperationAction(ISD::FLOG2, VT, Action);
+ setOperationAction(ISD::FLOG10, VT, Action);
+ setOperationAction(ISD::FEXP, VT, Action);
+ setOperationAction(ISD::FEXP2, VT, Action);
+ setOperationAction(ISD::FCEIL, VT, Action);
+ setOperationAction(ISD::FFLOOR, VT, Action);
+ setOperationAction(ISD::FNEARBYINT, VT, Action);
+ setOperationAction(ISD::FRINT, VT, Action);
+ setOperationAction(ISD::BR_CC, VT, Action);
+ setOperationAction(ISD::SETCC, VT, Action);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Action);
+ setOperationAction(ISD::FROUND, VT, Action);
+ setOperationAction(ISD::FROUNDEVEN, VT, Action);
+ setOperationAction(ISD::FTRUNC, VT, Action);
+ };
+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// f16, f32 and f64 use SSE.
// Set up the FP register classes.
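
The new setF16Action lambda collapses roughly thirty per-opcode setOperationAction calls into one helper that is reused below for f16, v8f16, v16f16 and v32f16. A toy, self-contained sketch of the pattern; Action, Opc and the table here are illustrative stand-ins, not LLVM's types:

    #include <map>
    #include <utility>

    enum class Action { Legal, Promote, Expand, Custom };
    enum class Opc { FABS, FNEG, FCOPYSIGN, FSQRT, FCEIL, SELECT };

    // Toy stand-in for the (opcode, value type) -> action table.
    static std::map<std::pair<Opc, int>, Action> OpActions;

    // Apply one action to the whole f16 opcode list; a couple of entries are
    // pinned regardless of the requested action, exactly as in the lambda
    // (FCOPYSIGN is always Expand, SELECT always Custom).
    static void setF16Action(int VT, Action A) {
      for (Opc Op : {Opc::FABS, Opc::FNEG, Opc::FSQRT, Opc::FCEIL})
        OpActions[{Op, VT}] = A;
      OpActions[{Opc::FCOPYSIGN, VT}] = Action::Expand;
      OpActions[{Opc::SELECT, VT}] = Action::Custom;
    }

    int main() { setF16Action(/*f16*/ 0, Action::Promote); return 0; }
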
@@ -592,40 +625,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Half type will be promoted by default.
- setOperationAction(ISD::FABS, MVT::f16, Promote);
- setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setF16Action(MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FCEIL, MVT::f16, Promote);
- setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
- setOperationAction(ISD::FRINT, MVT::f16, Promote);
- setOperationAction(ISD::BR_CC, MVT::f16, Promote);
- setOperationAction(ISD::SETCC, MVT::f16, Promote);
- setOperationAction(ISD::SELECT, MVT::f16, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
- setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
@@ -1003,6 +1007,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
+ addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
@@ -1084,7 +1090,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1095,19 +1101,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
@@ -1118,8 +1130,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
@@ -1304,6 +1316,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
+ addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
@@ -1340,12 +1354,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
@@ -1356,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
@@ -1386,6 +1401,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
@@ -1507,7 +1523,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 }) {
+ MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1518,6 +1534,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
+ setF16Action(MVT::v16f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v16f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
@@ -1532,11 +1553,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
+ Subtarget.hasF16C()) {
+ for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+ }
+ for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
+ }
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ }
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
}
// This block controls legalization of the mask vector sizes that are
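
For targets that have F16C but not AVX512-FP16, the block above legalizes v8f16/v16f16 arithmetic by promoting it to the matching f32 vector type. The underlying hardware pattern (extend halves, compute in single precision, round back) can be demonstrated directly with the F16C intrinsics; a sketch, compiled with -mf16c:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t ha[4] = {0x3c00, 0x4000, 0x4200, 0x4400}; // 1,2,3,4 as halves
      __m128i a = _mm_loadl_epi64((const __m128i *)ha);
      __m128 fa = _mm_cvtph_ps(a);    // VCVTPH2PS: extend halves to floats
      __m128 fb = _mm_add_ps(fa, fa); // do the arithmetic in f32
      __m128i r = _mm_cvtps_ph(fb, _MM_FROUND_CUR_DIRECTION); // round back
      uint16_t out[8];
      _mm_storeu_si128((__m128i *)out, r);
      printf("0x%04x\n", out[0]); // 0x4000 == 2.0 in IEEE half
      return 0;
    }
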
@@ -1619,6 +1652,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
@@ -1645,14 +1679,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
@@ -1664,7 +1700,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
@@ -1799,15 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
-
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
+ setOperationAction(Opc, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1831,7 +1861,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
- MVT::v16f32, MVT::v8f64 }) {
+ MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1842,6 +1872,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v32f16, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
@@ -1881,23 +1920,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -1934,25 +1959,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
- for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::MUL, VT, Legal);
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
+ setOperationAction(Opc, MVT::v2i64, Custom);
+ setOperationAction(Opc, MVT::v4i64, Custom);
}
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -2052,7 +2066,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// AVX512_FP16 scalar operations
setGroup(MVT::f16);
- addRegisterClass(MVT::f16, &X86::FR16XRegClass);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
@@ -2066,6 +2079,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
@@ -2073,14 +2087,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
- addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
@@ -2112,8 +2129,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasVLX()) {
- addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
- addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
@@ -2132,8 +2147,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
// INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
@@ -2347,7 +2366,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FP16_TO_FP,
ISD::FP_EXTEND,
ISD::STRICT_FP_EXTEND,
- ISD::FP_ROUND});
+ ISD::FP_ROUND,
+ ISD::STRICT_FP_ROUND});
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2404,6 +2424,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
+ return TypeSplitVector;
+
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -2447,22 +2471,21 @@ handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return RegisterVT;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register type is f16. We override the type to v8f16 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return MVT::v8f16;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return MVT::v8f16;
+ }
// We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
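
The generalized test above replaces the old v3f16 special case: any f16 vector with fewer than 8 elements is now passed as one v8f16 register (a single XMM), and the companion hunk below makes the register count 1 to match. A toy model of the two rules, with ToyVT standing in for MVT:

    #include <cassert>
    #include <string>

    struct ToyVT { unsigned NumElts; bool EltIsF16; };

    static std::string registerTypeFor(ToyVT VT) {
      if (VT.EltIsF16 && VT.NumElts < 8)
        return "v8f16"; // v2f16/v3f16/v4f16 all widen to one XMM
      return "<default>";
    }

    static unsigned numRegistersFor(ToyVT VT) {
      return (VT.EltIsF16 && VT.NumElts < 8) ? 1 : 0 /* default path */;
    }

    int main() {
      assert(registerTypeFor({3, true}) == "v8f16");
      assert(numRegistersFor({3, true}) == 1);
      return 0;
    }
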
@@ -2475,22 +2498,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return NumRegisters;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register number is 3. We override the number to 1 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return 1;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return 1;
+ }
// We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
// x87 is disabled.
@@ -9646,13 +9668,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
- // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 ||
(ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
- (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
+ CVT == MVT::f16 ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -14129,6 +14151,16 @@ static bool isShuffleFoldableLoad(SDValue V) {
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
+template<typename T>
+static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
+ return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
+}
+
+template<typename T>
+bool X86TargetLowering::isSoftFP16(T VT) const {
+ return ::isSoftFP16(VT, Subtarget);
+}
+
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
@@ -14140,6 +14172,9 @@ static SDValue lowerShuffleAsElementInsertion(
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
+
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
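
isSoftFP16 moves earlier in the file and, more importantly, now keys on the scalar element type, so vector types such as v8f16 also count as "soft" on targets without AVX512-FP16 (the old version, removed further down, compared VT == MVT::f16 only). A standalone restatement, with ToyVT and Features as illustrative stand-ins:

    // "Soft" FP16: the element type is half but there is no native
    // AVX512-FP16, so such operations must be promoted, expanded, or
    // bitcast to integer instead of selected directly.
    struct ToyVT { bool ScalarIsF16; };
    struct Features { bool HasFP16; };

    static bool isSoftFP16(ToyVT VT, Features F) {
      return VT.ScalarIsF16 && !F.HasFP16;
    }

    int main() { return isSoftFP16({true}, {false}) ? 0 : 1; }
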
@@ -19444,6 +19479,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
+ DAG.getBitcast(NVT, LHS),
+ DAG.getBitcast(NVT, RHS)));
+ }
+
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
@@ -19467,8 +19511,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
- SDLoc dl(Op);
- MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
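
The early-out added to LowerVSELECT above handles soft-f16 vectors by bitcasting both data operands to the same-width integer type, selecting, and bitcasting back: a select only routes bits, so no half-precision arithmetic is needed. The same idea in scalar, standalone form, using raw IEEE-half bit patterns:

    #include <cstdint>
    #include <cstdio>

    // Select between two "half" values held as raw 16-bit patterns; a pure
    // bit copy is safe for any FP payload, NaNs included.
    static uint16_t selectHalfBits(bool Cond, uint16_t A, uint16_t B) {
      return Cond ? A : B;
    }

    int main() {
      uint16_t One = 0x3c00, Two = 0x4000; // 1.0 and 2.0 as IEEE halves
      printf("0x%04x\n", selectHalfBits(true, One, Two)); // 0x3c00
      return 0;
    }
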
@@ -20856,16 +20898,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
return Cvt;
}
-template<typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
- return VT == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template<typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
- return ::isSoftFP16(VT, Subtarget);
-}
-
static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
@@ -20885,6 +20917,26 @@ static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}
+static bool isLegalConversion(MVT VT, bool IsSigned,
+ const X86Subtarget &Subtarget) {
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
+ return true;
+ if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
+ return true;
+ if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
+ return true;
+ if (Subtarget.useAVX512Regs()) {
+ if (VT == MVT::v16i32)
+ return true;
+ if (VT == MVT::v8i64 && Subtarget.hasDQI())
+ return true;
+ }
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i64))
+ return true;
+ return false;
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20897,6 +20949,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (isSoftFP16(VT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, true, Subtarget))
+ return Op;
if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
return LowerWin64_INT128_TO_FP(Op, DAG);
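
All of the int-to-fp and fp-to-int actions flipped from Legal to Custom above funnel into the new isLegalConversion predicate: the custom lowering simply returns the node unchanged when the conversion is natively supported, and post-processes the remaining cases. A standalone transcription of the predicate, with Features as an illustrative stand-in for X86Subtarget:

    #include <string>

    struct Features { bool SSE2, AVX, VLX, AVX512Regs, DQI; };

    static bool isLegalConversion(const std::string &VT, bool IsSigned,
                                  Features F) {
      if (VT == "v4i32" && F.SSE2 && IsSigned) return true; // cvtdq2ps etc.
      if (VT == "v8i32" && F.AVX && IsSigned) return true;
      if (F.VLX && (VT == "v4i32" || VT == "v8i32")) return true;
      if (F.AVX512Regs) {
        if (VT == "v16i32") return true;
        if (VT == "v8i64" && F.DQI) return true;
      }
      if (F.DQI && F.VLX && (VT == "v2i64" || VT == "v4i64")) return true;
      return false;
    }

    int main() { return isLegalConversion("v4i32", true, {true}) ? 0 : 1; }
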
@@ -21400,6 +21454,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (isSoftFP16(DstVT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, false, Subtarget))
+ return Op;
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -22229,6 +22285,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
{NVT, MVT::Other}, {Chain, Src})});
return DAG.getNode(Op.getOpcode(), dl, VT,
DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
+ } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
+ return Op;
}
if (VT.isVector()) {
@@ -22826,7 +22884,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return Op;
if (SVT.getVectorElementType() == MVT::f16) {
- assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
+ assert(Subtarget.hasF16C() && "Unexpected features!");
if (SVT == MVT::v2f16)
In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
DAG.getUNDEF(MVT::v2f16));
@@ -22836,6 +22894,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
+ } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
+ return Op;
}
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -22854,34 +22914,19 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
return SDValue();
- if (VT == MVT::f16) {
- if (Subtarget.hasFP16())
- return Op;
-
- if (SVT != MVT::f32) {
- if (IsStrict)
- return DAG.getNode(
- ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
- {Chain,
- DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
- {Chain, In, Op2}),
- Op2});
-
- return DAG.getNode(ISD::FP_ROUND, DL, VT,
- DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
- Op2);
- }
-
- if (!Subtarget.hasF16C())
+ if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
+ if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
return SDValue();
+ if (VT.isVector())
+ return Op;
+
SDValue Res;
SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
MVT::i32);
@@ -24176,10 +24221,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (isFP) {
-#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
-#endif
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24741,6 +24786,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+ if (isSoftFP16(Op0.getValueType()))
+ return SDValue();
+
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
@@ -24931,10 +24979,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op1.getSimpleValueType();
SDValue CC;
- if (isSoftFP16(VT))
- return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
- DAG.getBitcast(MVT::i16, Op1),
- DAG.getBitcast(MVT::i16, Op2)));
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, Op1),
+ DAG.getBitcast(NVT, Op2)));
+ }
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
@@ -27268,27 +27318,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
- case Intrinsic::swift_async_context_addr: {
- auto &MF = DAG.getMachineFunction();
- auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
- if (Subtarget.is64Bit()) {
- MF.getFrameInfo().setFrameAddressIsTaken(true);
- X86FI->setHasSwiftAsyncContext(true);
- return SDValue(
- DAG.getMachineNode(
- X86::SUB64ri8, dl, MVT::i64,
- DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
- DAG.getTargetConstant(8, dl, MVT::i32)),
- 0);
- } else {
- // 32-bit so no special extended frame, create or reuse an existing stack
- // slot.
- if (!X86FI->getSwiftAsyncContextFrameIdx())
- X86FI->setSwiftAsyncContextFrameIdx(
- MF.getFrameInfo().CreateStackObject(4, Align(4), false));
- return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
- }
- }
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
@@ -27668,6 +27697,37 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
+
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc dl(Op);
+ auto &MF = DAG.getMachineFunction();
+ auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ X86FI->setHasSwiftAsyncContext(true);
+ SDValue Chain = Op->getOperand(0);
+ SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
+ SDValue Result =
+ SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
+ DAG.getTargetConstant(8, dl, MVT::i32)),
+ 0);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ CopyRBP.getValue(1));
+ } else {
+ // 32-bit so no special extended frame, create or reuse an existing
+ // stack slot.
+ if (!X86FI->getSwiftAsyncContextFrameIdx())
+ X86FI->setSwiftAsyncContextFrameIdx(
+ MF.getFrameInfo().CreateStackObject(4, Align(4), false));
+ SDValue Result =
+ DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ Op->getOperand(0));
+ }
+ }
+
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
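
Moving swift_async_context_addr from the chainless INTRINSIC_WO_CHAIN path into LowerINTRINSIC_W_CHAIN is the substantive fix here: the lowering must now thread a chain, returning { result, chain } via MERGE_VALUES so the CopyFromReg of RBP stays ordered with other side effects. A toy sketch of the returned pair shape (ToySDValue is not LLVM's SDValue):

    #include <cstdint>
    #include <utility>

    struct ToySDValue { uint64_t V; };

    // A chained lowering yields { result, chain }: the result feeds value
    // users, the chain keeps the node ordered among side-effecting nodes.
    static std::pair<ToySDValue, ToySDValue>
    lowerSwiftAsyncContextAddr(ToySDValue Chain, uint64_t RBP) {
      ToySDValue Result{RBP - 8}; // 64-bit case: the address is RBP - 8
      return {Result, Chain};     // cf. MERGE_VALUES(Result, CopyRBP chain)
    }

    int main() {
      return lowerSwiftAsyncContextAddr({0}, 128).first.V == 120 ? 0 : 1;
    }
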
@@ -32901,20 +32961,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
+ EVT SrcVT = Src.getValueType();
EVT VT = N->getValueType(0);
- EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
+ SDValue V;
if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
: DAG.getUNDEF(MVT::v2f32);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
}
+ if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
+ assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
+ if (SrcVT.getVectorElementType() != MVT::f32)
+ return;
+
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Chain, Src, Rnd});
+ else
+ V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
+
+ Results.push_back(DAG.getBitcast(MVT::v8f16, V));
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
+ return;
+ }
if (!isTypeLegal(Src.getValueType()))
return;
- SDValue V;
+ EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
- {N->getOperand(0), Src});
+ {Chain, Src});
else
V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
Results.push_back(V);
@@ -37342,6 +37421,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool IsUnary) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ unsigned SizeInBits = MaskVT.getSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
@@ -37409,7 +37489,10 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match against a OR if we're performing a blend shuffle and the
// non-blended source element is zero in each case.
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
+ if (SizeInBits == V1.getValueSizeInBits() &&
+ SizeInBits == V2.getValueSizeInBits() &&
+ (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
(EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
bool IsBlend = true;
unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
@@ -39652,11 +39735,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
- // FIXME: Remove this after we support vector FP16
- if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(),
- Subtarget))
- return SDValue();
-
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
@@ -40947,12 +41025,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
EltBits)) {
OpBits.clearAllBits();
OpElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero()))) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ OpBits.setAllBits();
+ OpElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
OpBits |= Invert ? ~EltBits[I] : EltBits[I];
OpElts.setBit(I);
}
+ }
}
return std::make_pair(OpBits, OpElts);
};
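
The loop restructuring above fixes a soundness hole: a constant lane that is undef was previously treated like a known-zero lane, so the sibling operand's lane looked dead; but undef may later be materialized as any value, making that lane observable, so the fix conservatively demands all of its bits. The same guard is applied in combineAnd and combineAndnp below. The hazard in miniature:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t X = 0xAB; // lane of the non-constant operand
      uint8_t U = 0xFF; // an undef constant lane may materialize as ~0...
      printf("0x%02X\n", X & U); // ...making X's lane fully observable, so
      // its bits cannot be discarded the way a known-zero lane's can.
      return 0;
    }
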
@@ -44715,7 +44801,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
// Early exit check
- if (!TLI.isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
@@ -47798,11 +47884,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero()) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if (!EltBits[I].isZero()) {
DemandedBits |= EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -51042,6 +51134,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
MVT VT = N->getSimpleValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
// ANDNP(undef, x) -> 0
// ANDNP(x, undef) -> 0
@@ -51060,6 +51154,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
if (SDValue Not = IsNOT(N0, DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
+ // Constant Folding
+ APInt Undefs0, Undefs1;
+ SmallVector<APInt> EltBits0, EltBits1;
+ if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0) &&
+ getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
+ SDLoc DL(N);
+ SmallVector<APInt> ResultBits;
+ for (int I = 0; I != NumElts; ++I)
+ ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
+ APInt ResultUndefs = APInt::getZero(NumElts);
+ return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
+ }
+
// TODO: Constant fold NOT(N0) to allow us to use AND.
// TODO: Do this in IsNOT with suitable oneuse checks?
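
The new constant-folding block computes each result lane as ~C0 & C1, which is exactly ANDNP's semantics: NOT of the first operand ANDed with the second. A two-line standalone check of the lane arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t C0 = 0x0F0F0F0F, C1 = 0x12345678;
      printf("0x%08X\n", ~C0 & C1); // 0x10305070 == ANDNP(C0, C1) per lane
      return 0;
    }
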
@@ -51074,20 +51181,24 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
APInt UndefElts;
SmallVector<APInt> EltBits;
- int NumElts = VT.getVectorNumElements();
- int EltSizeInBits = VT.getScalarSizeInBits();
APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
APInt DemandedElts = APInt::getAllOnes(NumElts);
if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero())) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -54714,8 +54825,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasFP16())
return SDValue();
+ bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
@@ -54736,8 +54848,15 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
// Destination is v8i16 with at least 8 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
std::max(8U, NumElts));
- SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
- DAG.getTargetConstant(4, dl, MVT::i32));
+ SDValue Cvt, Chain;
+ SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src, Rnd});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
+ }
// Extract down to real number of elements.
if (NumElts < 8) {
@@ -54746,7 +54865,12 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
- return DAG.getBitcast(VT, Cvt);
+ Cvt = DAG.getBitcast(VT, Cvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+
+ return Cvt;
}
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
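
combineFP_ROUND is taught about the strict variant above: both paths now funnel into (STRICT_)CVTPS2PH with rounding immediate 4, the "use the current MXCSR direction" encoding, and the strict path additionally merges the chain back in. The instruction itself is easy to inspect with the F16C intrinsic; a sketch, compiled with -mf16c:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m128 f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
      // Immediate 4 == _MM_FROUND_CUR_DIRECTION, the same constant the
      // combine passes as its rounding operand.
      __m128i h = _mm_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
      uint16_t out[8];
      _mm_storeu_si128((__m128i *)out, h);
      printf("0x%04x 0x%04x\n", out[0], out[3]); // 0x3c00 (1.0), 0x4400 (4.0)
      return 0;
    }
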
@@ -54954,6 +55078,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
case X86ISD::VBROADCAST_LOAD:
case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 48da7b3ac882..c105bde78ad1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3769,12 +3769,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(alignedloadv32i16 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(alignedloadv32f16 addr:$src),
+ (VMOVAPSZrm addr:$src)>;
def : Pat<(alignedloadv64i8 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(loadv16i32 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
def : Pat<(loadv32i16 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
+ def : Pat<(loadv32f16 addr:$src),
+ (VMOVUPSZrm addr:$src)>;
def : Pat<(loadv64i8 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
@@ -3783,12 +3787,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v16i32 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v32i16 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32f16 VR512:$src), addr:$dst),
+ (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(store (v64i8 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
}
@@ -3799,12 +3807,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSZ128rm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSZ128rm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
@@ -3813,12 +3825,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
@@ -3827,12 +3843,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(alignedloadv16i16 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSZ256rm addr:$src)>;
def : Pat<(alignedloadv32i8 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(loadv8i32 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
def : Pat<(loadv16i16 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSZ256rm addr:$src)>;
def : Pat<(loadv32i8 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
@@ -3841,12 +3861,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
@@ -3855,16 +3879,12 @@ let Predicates = [HasBWI] in {
(VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>;
- def : Pat<(v32f16 (alignedloadv32f16 addr:$src)),
- (VMOVAPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(v32f16 (loadv32f16 addr:$src)),
- (VMOVUPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
@@ -3878,10 +3898,6 @@ let Predicates = [HasBWI] in {
def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
- (VMOVAPSZmr addr:$dst, VR512:$src)>;
- def : Pat<(store (v32f16 VR512:$src), addr:$dst),
- (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
(VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
}
@@ -3890,16 +3906,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>;
- def : Pat<(v16f16 (alignedloadv16f16 addr:$src)),
- (VMOVAPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(v16f16 (loadv16f16 addr:$src)),
- (VMOVUPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
@@ -3913,10 +3925,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
- (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
- def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
- (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask),
(VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>;
@@ -3924,16 +3932,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>;
def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>;
- def : Pat<(v8f16 (alignedloadv8f16 addr:$src)),
- (VMOVAPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(v8f16 (loadv8f16 addr:$src)),
- (VMOVUPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
@@ -3947,10 +3951,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
- (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
- def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
- (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask),
(VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ec32ac2acad1..74ef831e1658 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -742,8 +742,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool X86InstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -869,7 +869,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- MI.isDereferenceableInvariantLoad(AA)) {
+ MI.isDereferenceableInvariantLoad()) {
Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -3892,6 +3892,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ "Load size exceeds stack slot");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILELOADD;
// tileloadd (%sp, %idx), %tmm
@@ -3913,8 +3917,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
- const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 4943d2152fd2..98da00c39bdb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -240,8 +240,7 @@ public:
unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06cb280e860a..c5557bd5df4e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
@@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
@@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16f16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -613,6 +632,17 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -3136,6 +3166,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
@@ -3143,6 +3175,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3152,6 +3186,8 @@ let Predicates = [UseSSE2] in {
(MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3374,12 +3410,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQArm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQArm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQUrm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQUrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVDQUrm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQUrm addr:$src)>;
@@ -3387,12 +3427,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
}
@@ -6431,6 +6475,8 @@ let Predicates = [HasAVX2, NoVLX] in {
(VMOVNTDQAYrm addr:$src)>;
def : Pat<(v16i16 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v16f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
def : Pat<(v32i8 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
}
@@ -6446,6 +6492,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
}
@@ -6461,6 +6509,8 @@ let Predicates = [UseSSE41] in {
(MOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
}
@@ -7050,6 +7100,8 @@ def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
}
@@ -7095,6 +7147,7 @@ let Predicates = [HasAVX1Only] in {
defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
@@ -7150,6 +7203,8 @@ let Predicates = [HasAVX1Only] in {
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7189,6 +7244,8 @@ let Predicates = [HasAVX1Only] in {
defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
@@ -7503,6 +7560,10 @@ def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0))
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
@@ -7759,6 +7823,8 @@ let Predicates = [HasAVX2] in {
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
}
@@ -7781,6 +7847,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7801,6 +7869,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index f4e25e4194db..1de2a1725954 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -254,8 +254,12 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
- StringRef TuneCPU =
- TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ // "x86-64" is a default target setting for many front ends. In these cases,
+ // they actually request for "generic" tuning unless the "tune-cpu" was
+ // specified.
+ StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString()
+ : CPU == "x86-64" ? "generic"
+ : (StringRef)CPU;
StringRef FS =
FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
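
[Editor's sketch, not part of the commit] The tune-CPU fallback introduced
above reduces to a small decision function. The name selectTuneCPU and the
plain std::string parameters are illustrative only, not LLVM API:

#include <string>

// Pick the tuning CPU: an explicit "tune-cpu" attribute wins; the common
// front-end default "x86-64" falls back to "generic" tuning; any other
// CPU string is tuned as itself.
std::string selectTuneCPU(bool HasTuneAttr, const std::string &TuneAttr,
                          const std::string &CPU) {
  if (HasTuneAttr)
    return TuneAttr;
  if (CPU == "x86-64")
    return "generic";
  return CPU;
}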
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b36f8a3d06d0..b27aac9c4e93 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1297,29 +1297,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.first = NumOfDests * NumOfShufflesPerDest;
}
- static const CostTblEntry AVX512FP16ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
-
- {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
- };
-
- if (!ST->useSoftFloat() && ST->hasFP16())
- if (const auto *Entry =
- CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
-
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
@@ -1339,17 +1316,22 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry AVX512BWShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
@@ -1369,6 +1351,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
@@ -1376,6 +1359,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+ {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
{TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
@@ -1408,11 +1392,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// FIXME: This just applies the type legalization cost rules above
// assuming these completely split.
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
{TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
{TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
{TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
{TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
@@ -1430,6 +1417,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
@@ -1437,9 +1425,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
{TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+ {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1448,6 +1438,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
@@ -1457,6 +1449,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
@@ -1493,6 +1487,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
{TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
@@ -1501,6 +1496,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
+ {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
{TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
@@ -1509,6 +1506,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v8i32, 1}, // vblendps
{TTI::SK_Select, MVT::v8f32, 1}, // vblendps
{TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
@@ -1517,6 +1515,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
@@ -1526,6 +1526,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
};
@@ -1540,6 +1542,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v4i32, 1}, // pblendw
{TTI::SK_Select, MVT::v4f32, 1}, // blendps
{TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+ {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
{TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
@@ -1549,18 +1552,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry SSSE3ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
{TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
@@ -1573,12 +1581,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
@@ -1586,6 +1596,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
{TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
@@ -1593,6 +1604,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus
@@ -1600,6 +1613,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
+ { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
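
[Editor's sketch, not part of the commit] All of the shuffle-cost tables
above are consulted the same way: the first entry matching the shuffle kind
and the legalized vector type wins, and its cost is scaled by the number of
legalized operations (LT.first). A reduced, self-contained analogue, where
CostEntry and lookupCost are illustrative names for llvm::CostTblEntry and
llvm::CostTableLookup:

#include <vector>

// Reduced analogue of the CostTableLookup pattern: linear scan, first
// matching (Kind, VT) entry wins.
struct CostEntry { int ShuffleKind; int SimpleVT; int Cost; };

int lookupCost(const std::vector<CostEntry> &Tbl, int Kind, int VT,
               int LegalizedOps /* LT.first in the real code */) {
  for (const CostEntry &E : Tbl)
    if (E.ShuffleKind == Kind && E.SimpleVT == VT)
      return LegalizedOps * E.Cost;  // i.e. LT.first * Entry->Cost
  return -1;  // no entry: the caller falls through to the next table
}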
@@ -5219,7 +5233,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
- if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+ if (ScalarTy->isHalfTy() && ST->hasBWI())
return true;
if (!ScalarTy->isIntegerTy())
@@ -5674,8 +5688,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
- (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
return HasBW;
return false;
};
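
[Editor's sketch, not part of the commit] The relaxed element-type check
above, restated as a self-contained predicate; EltKind is an illustrative
stand-in for the Type queries in the real lambda:

enum class EltKind { F16, F32, F64, I8, I16, I32, I64, Ptr, Other };

// Mirrors the lambda in getInterleavedMemoryOpCost after this change:
// 32/64-bit elements and pointers are always supported; 8/16-bit integers
// and half are gated only by AVX512BW (no AVX512FP16 requirement anymore).
bool isSupportedEltType(EltKind K, bool HasBW) {
  switch (K) {
  case EltKind::F32: case EltKind::F64:
  case EltKind::I32: case EltKind::I64: case EltKind::Ptr:
    return true;
  case EltKind::I8: case EltKind::I16: case EltKind::F16:
    return HasBW;
  default:
    return false;
  }
}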