| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-24 15:03:44 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-24 15:03:44 +0000 |
| commit | 4b4fe385e49bd883fd183b5f21c1ea486c722e61 | |
| tree | c3d8fdb355c9c73e57723718c22103aaf7d15aa6 /llvm/lib/Target/X86 | |
| parent | 1f917f69ff07f09b6dbb670971f57f8efe718b84 | |
Diffstat (limited to 'llvm/lib/Target/X86')
| file | lines changed |
|---|---|
| llvm/lib/Target/X86/X86.td | 2 |
| llvm/lib/Target/X86/X86FixupBWInsts.cpp | 12 |
| llvm/lib/Target/X86/X86ISelLowering.cpp | 561 |
| llvm/lib/Target/X86/X86InstrAVX512.td | 48 |
| llvm/lib/Target/X86/X86InstrInfo.cpp | 12 |
| llvm/lib/Target/X86/X86InstrInfo.h | 3 |
| llvm/lib/Target/X86/X86InstrSSE.td | 70 |
| llvm/lib/Target/X86/X86TargetMachine.cpp | 8 |
| llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 65 |
9 files changed, 497 insertions, 284 deletions
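One of the smaller but notable changes in this patch is the X86.td hunk at the top of the diff below: the guidance for lit tests moves from requesting the plain "x86-64" CPU to attaching a "tune-cpu"="x86-64" function attribute. As a minimal sketch of what that looks like in a test (the RUN line, function body, CHECK lines, and attribute-group number are illustrative, not taken from this patch):

```llvm
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

; Tuning is pinned on the function itself instead of being implied by -mcpu.
define float @add_floats(float %a, float %b) #0 {
; CHECK-LABEL: add_floats:
; CHECK: addss
; CHECK: retq
entry:
  %sum = fadd float %a, %b
  ret float %sum
}

attributes #0 = { "tune-cpu"="x86-64" }
```

Keeping the tuning as an explicit attribute in the test is the consistency the updated comment in the hunk is asking for.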
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a859176220c7..fa0a6bd415dc 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1277,7 +1277,7 @@ class ProcModel<string Name, SchedMachineModel Model, // enabled. It has no effect on code generation. // NOTE: As a default tuning, "generic" aims to produce code optimized for the // most common X86 processors. The tunings might be changed over time. It is -// recommended to use "x86-64" in lit tests for consistency. +// recommended to use "tune-cpu"="x86-64" in function attribute for consistency. def : ProcModel<"generic", SandyBridgeModel, [FeatureX87, FeatureCX8, FeatureX86_64], [TuningSlow3OpsLEA, diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index 16bff201dd03..db6923416177 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, switch (MI->getOpcode()) { case X86::MOV8rm: - // Only replace 8 bit loads with the zero extending versions if - // in an inner most loop and not optimizing for size. This takes - // an extra byte to encode, and provides limited performance upside. - if (MachineLoop *ML = MLI->getLoopFor(&MBB)) - if (ML->begin() == ML->end() && !OptForSize) - return tryReplaceLoad(X86::MOVZX32rm8, MI); + // Replace 8-bit loads with the zero-extending version if not optimizing + // for size. The extending op is cheaper across a wide range of uarch and + // it avoids a potentially expensive partial register stall. It takes an + // extra byte to encode, however, so don't do this when optimizing for size. + if (!OptForSize) + return tryReplaceLoad(X86::MOVZX32rm8, MI); break; case X86::MOV16rm: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 12af6087cb47..5a4533c4bac4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -555,6 +555,39 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + auto setF16Action = [&] (MVT VT, LegalizeAction Action) { + setOperationAction(ISD::FABS, VT, Action); + setOperationAction(ISD::FNEG, VT, Action); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FREM, VT, Action); + setOperationAction(ISD::FMA, VT, Action); + setOperationAction(ISD::FMINNUM, VT, Action); + setOperationAction(ISD::FMAXNUM, VT, Action); + setOperationAction(ISD::FMINIMUM, VT, Action); + setOperationAction(ISD::FMAXIMUM, VT, Action); + setOperationAction(ISD::FSIN, VT, Action); + setOperationAction(ISD::FCOS, VT, Action); + setOperationAction(ISD::FSINCOS, VT, Action); + setOperationAction(ISD::FSQRT, VT, Action); + setOperationAction(ISD::FPOW, VT, Action); + setOperationAction(ISD::FLOG, VT, Action); + setOperationAction(ISD::FLOG2, VT, Action); + setOperationAction(ISD::FLOG10, VT, Action); + setOperationAction(ISD::FEXP, VT, Action); + setOperationAction(ISD::FEXP2, VT, Action); + setOperationAction(ISD::FCEIL, VT, Action); + setOperationAction(ISD::FFLOOR, VT, Action); + setOperationAction(ISD::FNEARBYINT, VT, Action); + setOperationAction(ISD::FRINT, VT, Action); + setOperationAction(ISD::BR_CC, VT, Action); + setOperationAction(ISD::SETCC, VT, Action); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SELECT_CC, VT, Action); + setOperationAction(ISD::FROUND, VT, Action); + 
setOperationAction(ISD::FROUNDEVEN, VT, Action); + setOperationAction(ISD::FTRUNC, VT, Action); + }; + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { // f16, f32 and f64 use SSE. // Set up the FP register classes. @@ -592,40 +625,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Half type will be promoted by default. - setOperationAction(ISD::FABS, MVT::f16, Promote); - setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setF16Action(MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::BR_CC, MVT::f16, Promote); - setOperationAction(ISD::SETCC, MVT::f16, Promote); - setOperationAction(ISD::SELECT, MVT::f16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); @@ -1003,6 +1007,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); + addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? 
&X86::VR128XRegClass @@ -1084,7 +1090,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { + for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1095,19 +1101,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); + setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. @@ -1118,8 +1130,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); @@ -1304,6 +1316,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); + addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? 
&X86::VR256XRegClass @@ -1340,12 +1354,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); @@ -1356,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); @@ -1386,6 +1401,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); + setOperationAction(ISD::SELECT, MVT::v16f16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); @@ -1507,7 +1523,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Custom lower several nodes for 256-bit types. 
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v8f32, MVT::v4f64 }) { + MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1518,6 +1534,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } + setF16Action(MVT::v16f16, Expand); + setOperationAction(ISD::FADD, MVT::v16f16, Expand); + setOperationAction(ISD::FSUB, MVT::v16f16, Expand); + setOperationAction(ISD::FMUL, MVT::v16f16, Expand); + setOperationAction(ISD::FDIV, MVT::v16f16, Expand); if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); @@ -1532,11 +1553,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && + Subtarget.hasF16C()) { + for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + } + for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); + } + for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); + setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); + } + + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); } // This block controls legalization of the mask vector sizes that are @@ -1619,6 +1652,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { @@ -1645,14 +1679,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } - setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, 
MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); @@ -1664,7 +1700,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); @@ -1799,15 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHR, MVT::v16i32, Custom); if (Subtarget.hasDQI()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); - + for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Opc, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1831,7 +1861,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, - MVT::v16f32, MVT::v8f64 }) { + MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); @@ -1842,6 +1872,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v32f16, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); + setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); + } for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); @@ -1881,23 +1920,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // These operations are handled on non-VLX by artificially widening in // isel patterns. - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, - Subtarget.hasVLX() ? 
Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. @@ -1934,25 +1959,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { - for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::UINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_SINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_UINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::MUL, VT, Legal); + for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { + setOperationAction(Opc, MVT::v2i64, Custom); + setOperationAction(Opc, MVT::v4i64, Custom); } + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); } if (Subtarget.hasCDI()) { @@ -2052,7 +2066,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // AVX512_FP16 scalar operations setGroup(MVT::f16); - addRegisterClass(MVT::f16, &X86::FR16XRegClass); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -2066,6 +2079,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); @@ -2073,14 +2087,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); - addRegisterClass(MVT::v32f16, &X86::VR512RegClass); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + 
setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); @@ -2112,8 +2129,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasVLX()) { - addRegisterClass(MVT::v8f16, &X86::VR128XRegClass); - addRegisterClass(MVT::v16f16, &X86::VR256XRegClass); setGroup(MVT::v8f16); setGroup(MVT::v16f16); @@ -2132,8 +2147,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); @@ -2347,7 +2366,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FP16_TO_FP, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, - ISD::FP_ROUND}); + ISD::FP_ROUND, + ISD::STRICT_FP_ROUND}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2404,6 +2424,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + return TypeSplitVector; + + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -2447,22 +2471,21 @@ handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return RegisterVT; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register type is f16. We override the type to v8f16 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return MVT::v8f16; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return MVT::v8f16; + } // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && @@ -2475,22 +2498,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return NumRegisters; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register number is 3. We override the number to 1 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return 1; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return 1; + } // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if // x87 is disabled. @@ -9646,13 +9668,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || - (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || + CVT == MVT::f16 || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) @@ -14129,6 +14151,16 @@ static bool isShuffleFoldableLoad(SDValue V) { ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } +template<typename T> +static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { + return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16(); +} + +template<typename T> +bool X86TargetLowering::isSoftFP16(T VT) const { + return ::isSoftFP16(VT, Subtarget); +} + /// Try to lower insertion of a single element into a zero vector. 
/// /// This is a common pattern that we have especially efficient patterns to lower @@ -14140,6 +14172,9 @@ static SDValue lowerShuffleAsElementInsertion( MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); + int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); @@ -19444,6 +19479,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + if (isSoftFP16(VT)) { + MVT NVT = VT.changeVectorElementTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, + DAG.getBitcast(NVT, LHS), + DAG.getBitcast(NVT, RHS))); + } + // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && @@ -19467,8 +19511,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); @@ -20856,16 +20898,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, return Cvt; } -template<typename T> -static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { - return VT == MVT::f16 && !Subtarget.hasFP16(); -} - -template<typename T> -bool X86TargetLowering::isSoftFP16(T VT) const { - return ::isSoftFP16(VT, Subtarget); -} - static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { bool IsStrict = Op->isStrictFPOpcode(); SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); @@ -20885,6 +20917,26 @@ static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd); } +static bool isLegalConversion(MVT VT, bool IsSigned, + const X86Subtarget &Subtarget) { + if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned) + return true; + if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned) + return true; + if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32)) + return true; + if (Subtarget.useAVX512Regs()) { + if (VT == MVT::v16i32) + return true; + if (VT == MVT::v8i64 && Subtarget.hasDQI()) + return true; + } + if (Subtarget.hasDQI() && Subtarget.hasVLX() && + (VT == MVT::v2i64 || VT == MVT::v4i64)) + return true; + return false; +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -20897,6 +20949,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, if (isSoftFP16(VT)) return promoteXINT_TO_FP(Op, DAG); + else if (isLegalConversion(SrcVT, true, Subtarget)) + return Op; if (Subtarget.isTargetWin64() && SrcVT == MVT::i128) return LowerWin64_INT128_TO_FP(Op, DAG); @@ -21400,6 +21454,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (isSoftFP16(DstVT)) return promoteXINT_TO_FP(Op, DAG); + else if (isLegalConversion(SrcVT, false, Subtarget)) + return Op; if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -22229,6 +22285,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { {NVT, MVT::Other}, {Chain, Src})}); return DAG.getNode(Op.getOpcode(), dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); + } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) { + return Op; } if (VT.isVector()) { @@ -22826,7 +22884,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return Op; if (SVT.getVectorElementType() == MVT::f16) { - assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); + assert(Subtarget.hasF16C() && "Unexpected features!"); if (SVT == MVT::v2f16) In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, DAG.getUNDEF(MVT::v2f16)); @@ -22836,6 +22894,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, {Op->getOperand(0), Res}); return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); + } else if (VT == MVT::v4f64 || VT == MVT::v8f64) { + return Op; } assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -22854,34 +22914,19 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); - SDValue Op2 = Op.getOperand(IsStrict ? 
2 : 1); MVT VT = Op.getSimpleValueType(); MVT SVT = In.getSimpleValueType(); if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) return SDValue(); - if (VT == MVT::f16) { - if (Subtarget.hasFP16()) - return Op; - - if (SVT != MVT::f32) { - if (IsStrict) - return DAG.getNode( - ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, - {Chain, - DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, - {Chain, In, Op2}), - Op2}); - - return DAG.getNode(ISD::FP_ROUND, DL, VT, - DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), - Op2); - } - - if (!Subtarget.hasF16C()) + if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { + if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) return SDValue(); + if (VT.isVector()) + return Op; + SDValue Res; SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, MVT::i32); @@ -24176,10 +24221,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); if (isFP) { -#ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); -#endif + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); @@ -24741,6 +24786,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); + if (isSoftFP16(Op0.getValueType())) + return SDValue(); + // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { @@ -24931,10 +24979,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; - if (isSoftFP16(VT)) - return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, - DAG.getBitcast(MVT::i16, Op1), - DAG.getBitcast(MVT::i16, Op2))); + if (isSoftFP16(VT)) { + MVT NVT = VT.changeTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, + DAG.getBitcast(NVT, Op1), + DAG.getBitcast(NVT, Op2))); + } // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. @@ -27268,27 +27318,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } - case Intrinsic::swift_async_context_addr: { - auto &MF = DAG.getMachineFunction(); - auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - MF.getFrameInfo().setFrameAddressIsTaken(true); - X86FI->setHasSwiftAsyncContext(true); - return SDValue( - DAG.getMachineNode( - X86::SUB64ri8, dl, MVT::i64, - DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64), - DAG.getTargetConstant(8, dl, MVT::i32)), - 0); - } else { - // 32-bit so no special extended frame, create or reuse an existing stack - // slot. 
- if (!X86FI->getSwiftAsyncContextFrameIdx()) - X86FI->setSwiftAsyncContextFrameIdx( - MF.getFrameInfo().CreateStackObject(4, Align(4), false)); - return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32); - } - } case Intrinsic::x86_avx512_vp2intersect_q_512: case Intrinsic::x86_avx512_vp2intersect_q_256: case Intrinsic::x86_avx512_vp2intersect_q_128: @@ -27668,6 +27697,37 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { + + case Intrinsic::swift_async_context_addr: { + SDLoc dl(Op); + auto &MF = DAG.getMachineFunction(); + auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (Subtarget.is64Bit()) { + MF.getFrameInfo().setFrameAddressIsTaken(true); + X86FI->setHasSwiftAsyncContext(true); + SDValue Chain = Op->getOperand(0); + SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64); + SDValue Result = + SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP, + DAG.getTargetConstant(8, dl, MVT::i32)), + 0); + // Return { result, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + CopyRBP.getValue(1)); + } else { + // 32-bit so no special extended frame, create or reuse an existing + // stack slot. + if (!X86FI->getSwiftAsyncContextFrameIdx()) + X86FI->setSwiftAsyncContextFrameIdx( + MF.getFrameInfo().CreateStackObject(4, Align(4), false)); + SDValue Result = + DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32); + // Return { result, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + Op->getOperand(0)); + } + } + case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: @@ -32901,20 +32961,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); + SDValue Rnd = N->getOperand(IsStrict ? 2 : 1); + EVT SrcVT = Src.getValueType(); EVT VT = N->getValueType(0); - EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + SDValue V; if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32) : DAG.getUNDEF(MVT::v2f32); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); } + if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { + assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); + if (SrcVT.getVectorElementType() != MVT::f32) + return; + + if (IsStrict) + V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Chain, Src, Rnd}); + else + V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd); + + Results.push_back(DAG.getBitcast(MVT::v8f16, V)); + if (IsStrict) + Results.push_back(V.getValue(1)); + return; + } if (!isTypeLegal(Src.getValueType())) return; - SDValue V; + EVT NewVT = VT.getVectorElementType() == MVT::f16 ? 
MVT::v8f16 : MVT::v4f32; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, - {N->getOperand(0), Src}); + {Chain, Src}); else V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); Results.push_back(V); @@ -37342,6 +37421,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, bool IsUnary) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); + unsigned SizeInBits = MaskVT.getSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) && @@ -37409,7 +37489,10 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to match against a OR if we're performing a blend shuffle and the // non-blended source element is zero in each case. - if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && + // TODO: Handle cases where V1/V2 sizes doesn't match SizeInBits. + if (SizeInBits == V1.getValueSizeInBits() && + SizeInBits == V2.getValueSizeInBits() && + (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { bool IsBlend = true; unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); @@ -39652,11 +39735,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); - // FIXME: Remove this after we support vector FP16 - if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(), - Subtarget)) - return SDValue(); - if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; @@ -40947,12 +41025,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( EltBits)) { OpBits.clearAllBits(); OpElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) || - (!Invert && !EltBits[I].isZero()))) { + for (int I = 0; I != NumElts; ++I) { + if (!DemandedElts[I]) + continue; + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. + OpBits.setAllBits(); + OpElts.setBit(I); + } else if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { OpBits |= Invert ? ~EltBits[I] : EltBits[I]; OpElts.setBit(I); } + } } return std::make_pair(OpBits, OpElts); }; @@ -44715,7 +44801,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } // Early exit check - if (!TLI.isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) @@ -47798,11 +47884,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, EltBits)) { DemandedBits.clearAllBits(); DemandedElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if (!EltBits[I].isZero()) { + for (int I = 0; I != NumElts; ++I) { + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. 
+ DemandedBits.setAllBits(); + DemandedElts.setBit(I); + } else if (!EltBits[I].isZero()) { DemandedBits |= EltBits[I]; DemandedElts.setBit(I); } + } } return std::make_pair(DemandedBits, DemandedElts); }; @@ -51042,6 +51134,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); + int NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); // ANDNP(undef, x) -> 0 // ANDNP(x, undef) -> 0 @@ -51060,6 +51154,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, if (SDValue Not = IsNOT(N0, DAG)) return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1); + // Constant Folding + APInt Undefs0, Undefs1; + SmallVector<APInt> EltBits0, EltBits1; + if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0) && + getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) { + SDLoc DL(N); + SmallVector<APInt> ResultBits; + for (int I = 0; I != NumElts; ++I) + ResultBits.push_back(~EltBits0[I] & EltBits1[I]); + APInt ResultUndefs = APInt::getZero(NumElts); + return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL); + } + // TODO: Constant fold NOT(N0) to allow us to use AND. // TODO: Do this in IsNOT with suitable oneuse checks? @@ -51074,20 +51181,24 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { APInt UndefElts; SmallVector<APInt> EltBits; - int NumElts = VT.getVectorNumElements(); - int EltSizeInBits = VT.getScalarSizeInBits(); APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); APInt DemandedElts = APInt::getAllOnes(NumElts); if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) { DemandedBits.clearAllBits(); DemandedElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if ((Invert && !EltBits[I].isAllOnes()) || - (!Invert && !EltBits[I].isZero())) { + for (int I = 0; I != NumElts; ++I) { + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. + DemandedBits.setAllBits(); + DemandedElts.setBit(I); + } else if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { DemandedBits |= Invert ? ~EltBits[I] : EltBits[I]; DemandedElts.setBit(I); } + } } return std::make_pair(DemandedBits, DemandedElts); }; @@ -54714,8 +54825,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasFP16()) return SDValue(); + bool IsStrict = N->isStrictFPOpcode(); EVT VT = N->getValueType(0); - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || @@ -54736,8 +54848,15 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, // Destination is v8i16 with at least 8 elements. EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts)); - SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, - DAG.getTargetConstant(4, dl, MVT::i32)); + SDValue Cvt, Chain; + SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32); + if (IsStrict) { + Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other}, + {N->getOperand(0), Src, Rnd}); + Chain = Cvt.getValue(1); + } else { + Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd); + } // Extract down to real number of elements. 
if (NumElts < 8) { @@ -54746,7 +54865,12 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, DAG.getIntPtrConstant(0, dl)); } - return DAG.getBitcast(VT, Cvt); + Cvt = DAG.getBitcast(VT, Cvt); + + if (IsStrict) + return DAG.getMergeValues({Cvt, Chain}, dl); + + return Cvt; } static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { @@ -54954,6 +55078,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); case X86ISD::VBROADCAST_LOAD: case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 48da7b3ac882..c105bde78ad1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3769,12 +3769,16 @@ let Predicates = [HasAVX512] in { (VMOVDQA64Zrm addr:$src)>; def : Pat<(alignedloadv32i16 addr:$src), (VMOVDQA64Zrm addr:$src)>; + def : Pat<(alignedloadv32f16 addr:$src), + (VMOVAPSZrm addr:$src)>; def : Pat<(alignedloadv64i8 addr:$src), (VMOVDQA64Zrm addr:$src)>; def : Pat<(loadv16i32 addr:$src), (VMOVDQU64Zrm addr:$src)>; def : Pat<(loadv32i16 addr:$src), (VMOVDQU64Zrm addr:$src)>; + def : Pat<(loadv32f16 addr:$src), + (VMOVUPSZrm addr:$src)>; def : Pat<(loadv64i8 addr:$src), (VMOVDQU64Zrm addr:$src)>; @@ -3783,12 +3787,16 @@ let Predicates = [HasAVX512] in { (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v16i32 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v32i16 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32f16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(store (v64i8 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; } @@ -3799,12 +3807,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z128rm addr:$src)>; def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSZ128rm addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQA64Z128rm addr:$src)>; def : Pat<(loadv4i32 addr:$src), (VMOVDQU64Z128rm addr:$src)>; def : Pat<(loadv8i16 addr:$src), (VMOVDQU64Z128rm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSZ128rm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQU64Z128rm addr:$src)>; @@ -3813,12 +3825,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v4i32 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v8i16 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8f16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v16i8 
VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; @@ -3827,12 +3843,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z256rm addr:$src)>; def : Pat<(alignedloadv16i16 addr:$src), (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSZ256rm addr:$src)>; def : Pat<(alignedloadv32i8 addr:$src), (VMOVDQA64Z256rm addr:$src)>; def : Pat<(loadv8i32 addr:$src), (VMOVDQU64Z256rm addr:$src)>; def : Pat<(loadv16i16 addr:$src), (VMOVDQU64Z256rm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSZ256rm addr:$src)>; def : Pat<(loadv32i8 addr:$src), (VMOVDQU64Z256rm addr:$src)>; @@ -3841,12 +3861,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v8i32 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v16i16 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16f16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } @@ -3855,16 +3879,12 @@ let Predicates = [HasBWI] in { (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; - def : Pat<(v32f16 (alignedloadv32f16 addr:$src)), - (VMOVAPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (loadv32f16 addr:$src)), - (VMOVUPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; @@ -3878,10 +3898,6 @@ let Predicates = [HasBWI] in { def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), - (VMOVAPSZmr addr:$dst, VR512:$src)>; - def : Pat<(store (v32f16 VR512:$src), addr:$dst), - (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } @@ -3890,16 +3906,12 @@ let Predicates = [HasBWI, HasVLX] in { (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; - def : Pat<(v16f16 (alignedloadv16f16 addr:$src)), - (VMOVAPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (loadv16f16 addr:$src)), - (VMOVUPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, 
(v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; @@ -3913,10 +3925,6 @@ let Predicates = [HasBWI, HasVLX] in { def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), - (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; - def : Pat<(store (v16f16 VR256X:$src), addr:$dst), - (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; @@ -3924,16 +3932,12 @@ let Predicates = [HasBWI, HasVLX] in { (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; - def : Pat<(v8f16 (alignedloadv8f16 addr:$src)), - (VMOVAPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (loadv8f16 addr:$src)), - (VMOVUPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; @@ -3947,10 +3951,6 @@ let Predicates = [HasBWI, HasVLX] in { def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), - (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; - def : Pat<(store (v8f16 VR128X:$src), addr:$dst), - (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index ec32ac2acad1..74ef831e1658 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -742,8 +742,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const { +bool X86InstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable @@ -869,7 +869,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - MI.isDereferenceableInvariantLoad(AA)) { + MI.isDereferenceableInvariantLoad()) { Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; @@ -3892,6 +3892,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + "Load size exceeds stack slot"); if (RC->getID() == X86::TILERegClassID) { unsigned 
Opc = X86::TILELOADD; // tileloadd (%sp, %idx), %tmm @@ -3913,8 +3917,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } else { - const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 4943d2152fd2..98da00c39bdb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -240,8 +240,7 @@ public: unsigned isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 06cb280e860a..c5557bd5df4e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", let Predicates = [NoAVX512] in { def : Pat<(v16i8 immAllZerosV), (V_SET0)>; def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v8f16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; def : Pat<(v2i64 immAllZerosV), (V_SET0)>; def : Pat<(v2f64 immAllZerosV), (V_SET0)>; @@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", let Predicates = [NoAVX512] in { def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; @@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSrm addr:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16f16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; } // Use movaps / movups for SSE integer load / store (one byte shorter). 
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06cb280e860a..c5557bd5df4e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
@@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
@@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in {
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  def : Pat<(alignedloadv8f16 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv8f16 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedloadv16f16 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv16f16 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -613,6 +632,17 @@ let Predicates = [UseSSE1] in {
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
+let Predicates = [UseSSE2] in {
+  def : Pat<(alignedloadv8f16 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv8f16 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -3136,6 +3166,8 @@ let Predicates = [HasAVX, NoVLX] in {
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
@@ -3143,6 +3175,8 @@
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3152,6 +3186,8 @@ let Predicates = [UseSSE2] in {
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3374,12 +3410,16 @@ let Predicates = [HasAVX, NoVLX] in {
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv8f16 addr:$src),
+            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv8f16 addr:$src),
+            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (VMOVDQUrm addr:$src)>;
@@ -3387,12 +3427,16 @@ let Predicates = [HasAVX, NoVLX] in {
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}
@@ -6431,6 +6475,8 @@ let Predicates = [HasAVX2, NoVLX] in {
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}
@@ -6446,6 +6492,8 @@ let Predicates = [HasAVX, NoVLX] in {
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}
@@ -6461,6 +6509,8 @@ let Predicates = [UseSSE41] in {
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}
@@ -7050,6 +7100,8 @@ def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
}
@@ -7095,6 +7147,7 @@ let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
@@ -7150,6 +7203,8 @@ let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
+  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
+  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7189,6 +7244,8 @@ let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
+  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
@@ -7503,6 +7560,10 @@ def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0))
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
@@ -7759,6 +7823,8 @@ let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
+  defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
}
@@ -7781,6 +7847,8 @@ let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7801,6 +7869,8 @@ let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
+  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index f4e25e4194db..1de2a1725954 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -254,8 +254,12 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef CPU =
      CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
-  StringRef TuneCPU =
-      TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+  // "x86-64" is a default target setting for many front ends. In these cases,
+  // they actually request for "generic" tuning unless the "tune-cpu" was
+  // specified.
+  StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString()
+                      : CPU == "x86-64" ? "generic"
+                                        : (StringRef)CPU;
  StringRef FS =
      FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
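The getSubtargetImpl hunk above changes which tuning model is picked when a function carries "target-cpu"="x86-64" but no "tune-cpu" attribute: such functions now tune for "generic" rather than for the "x86-64" CPU itself. A minimal sketch of the selection rule, using std::string in place of LLVM's StringRef and Attribute types (the helper name selectTuneCPU is made up for illustration):

    #include <cassert>
    #include <string>

    // An explicit "tune-cpu" attribute wins; otherwise a bare "x86-64" target CPU
    // falls back to "generic" tuning, and any other CPU name tunes for itself.
    std::string selectTuneCPU(const std::string &TuneAttr, const std::string &CPU) {
      if (!TuneAttr.empty())
        return TuneAttr;
      return CPU == "x86-64" ? "generic" : CPU;
    }

    int main() {
      assert(selectTuneCPU("", "x86-64") == "generic");      // front-end default CPU
      assert(selectTuneCPU("", "skylake") == "skylake");     // explicit CPU tunes itself
      assert(selectTuneCPU("znver3", "x86-64") == "znver3"); // tune-cpu always wins
      return 0;
    }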
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b36f8a3d06d0..b27aac9c4e93 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1297,29 +1297,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
-  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
-      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
-      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
-      {TTI::SK_Broadcast, MVT::v8f16, 1},  // vpbroadcastw
-
-      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
-      {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
-      {TTI::SK_Reverse, MVT::v8f16, 1},  // vpshufb
-
-      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
-      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
-      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1},  // vpshufb
-
-      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
-      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
-      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2}  // vpermt2w
-  };
-
-  if (!ST->useSoftFloat() && ST->hasFP16())
-    if (const auto *Entry =
-            CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
-      return LT.first * Entry->Cost;
-
  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
@@ -1339,17 +1316,22 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
+      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
@@ -1369,6 +1351,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
@@ -1376,6 +1359,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+      {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
@@ -1408,11 +1392,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+      {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8, 1},  // vpternlogq
      {TTI::SK_Select, MVT::v8f64, 1},  // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
@@ -1430,6 +1417,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
@@ -1437,9 +1425,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+      {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+      {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1448,6 +1438,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
+      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
+                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
@@ -1457,6 +1449,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
+      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
+                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };
@@ -1493,6 +1487,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+      {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128

      {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
@@ -1501,6 +1496,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
+      {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
+                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
                                         // + vinsertf128
@@ -1509,6 +1506,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+      {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
@@ -1517,6 +1515,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
+      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
+                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
@@ -1526,6 +1526,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
+      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
+                                                // + 4*por + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
  };
@@ -1540,6 +1542,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+      {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
      {TTI::SK_Select, MVT::v16i8, 1}  // pblendvb
  };
@@ -1549,18 +1552,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
  static const CostTblEntry SSSE3ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+      {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por

      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
  };
@@ -1573,12 +1581,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+      {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd

      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+      {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
                                        // + 2*pshufd + 2*unpck + packus
@@ -1586,6 +1596,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+      {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por

      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
@@ -1593,6 +1604,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
                                                 // + pshufd/unpck
+      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
+                                                 // + pshufd/unpck
      { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                                    // + 2*pshufd + 2*unpck + 2*packus
@@ -1600,6 +1613,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 },  // shufpd
      { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 },  // 2*{unpck,movsd,pshufd}
      { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 },  // blend+permute
+      { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 },  // blend+permute
      { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
  };
@@ -5219,7 +5233,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

-  if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+  if (ScalarTy->isHalfTy() && ST->hasBWI())
    return true;

  if (!ScalarTy->isIntegerTy())
@@ -5674,8 +5688,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
-    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
-        (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
      return HasBW;
    return false;
  };
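The X86TargetTransformInfo.cpp hunks above delete the separate AVX512FP16 shuffle-cost table and instead add f16 vector entries (v8f16/v16f16/v32f16) to the generic AVX512/AVX2/AVX1/SSE tables, and they relax the half-type checks in isLegalMaskedLoad and getInterleavedMemoryOpCost so they no longer require hasFP16(). A simplified, self-contained sketch of how such a cost table is consulted; the types below are stand-ins for LLVM's CostTblEntry, TTI::ShuffleKind and MVT, not the real API:

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Simplified stand-ins for TTI::ShuffleKind and MVT.
    enum ShuffleKind { SK_Broadcast, SK_Reverse, SK_Select, SK_PermuteSingleSrc, SK_PermuteTwoSrc };
    enum VectorType { v8f16, v16f16, v32f16 };

    struct CostEntry {
      ShuffleKind Kind;
      VectorType VT;
      unsigned Cost;
    };

    // The first entry matching both the shuffle kind and the legalized vector type
    // wins; the result is scaled by the legalization factor (LT.first in the patch).
    std::optional<unsigned> lookupShuffleCost(const std::vector<CostEntry> &Table,
                                              ShuffleKind Kind, VectorType VT,
                                              unsigned LegalizationFactor) {
      for (const CostEntry &E : Table)
        if (E.Kind == Kind && E.VT == VT)
          return LegalizationFactor * E.Cost;
      return std::nullopt;
    }

    int main() {
      // Mirrors a few of the AVX512BW rows added above: vpbroadcastw / vpermw.
      const std::vector<CostEntry> AVX512BW = {
          {SK_Broadcast, v32f16, 1},
          {SK_Reverse, v32f16, 2},
          {SK_PermuteSingleSrc, v32f16, 2},
      };
      if (auto C = lookupShuffleCost(AVX512BW, SK_Reverse, v32f16, 1))
        std::printf("v32f16 reverse shuffle cost: %u\n", *C);
      return 0;
    }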
