path: root/lib/Target/X86/X86ISelLowering.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/X86/X86ISelLowering.cpp
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  9548
1 file changed, 6168 insertions, 3380 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..0b4bf687e6cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addBypassSlowDiv(64, 32);
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
+ if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setUseUnderscoreLongJmp(true);
}
+ // If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic
+ // size to 32 bits so that AtomicExpandPass will expand it and we don't need
+ // cmpxchg8b.
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
+ // 1024.
+ if (!Subtarget.hasCmpxchg8b())
+ setMaxAtomicSizeInBitsSupported(32);
+
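
As a side note on the cap above: once the supported atomic width is limited to 32 bits, any wider atomic operation is expected to be expanded rather than selected natively. A minimal standalone sketch of that size check in plain C++ (the function name here is made up for illustration and is not AtomicExpandPass's API):

#include <cassert>

// Illustrative only: an atomic access wider than the target's supported
// maximum cannot be lowered to a native cmpxchg sequence and must be
// expanded (e.g. to a __atomic_* runtime call) instead.
static bool needsAtomicExpansion(unsigned AccessBits, unsigned MaxSupportedBits) {
  return AccessBits > MaxSupportedBits;
}

int main() {
  const unsigned MaxBits = 32;                // no cmpxchg8b: cap at 32 bits
  assert(!needsAtomicExpansion(32, MaxBits)); // i32 atomics stay inline
  assert(needsAtomicExpansion(64, MaxBits));  // i64 atomics get expanded
  return 0;
}
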
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
@@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
@@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- }
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
@@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ else
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
+ if (!Subtarget.is64Bit())
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
@@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
+ // Disable f32->f64 extload as we can only generate this in one instruction
+ // under optsize. So it's easier to pattern match (fpext (load)) for that
+ // case instead of needing to emit 2 instructions for extload in the
+ // non-optsize case.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
@@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
+ setOperationAction(ISD::LROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
// Always use a library call for pow.
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
if (!ExperimentalVectorWideningLegalization) {
// Use widening instead of promotion.
@@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
-
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
- setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
- setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
-
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
- if (ExperimentalVectorWideningLegalization) {
- // These types need custom splitting if their input is a 128-bit vector.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- }
+ // These types need custom splitting if their input is a 128-bit vector.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
@@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
- // TODO - remove this once 256-bit X86ISD::ANDNP correctly split.
- setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom);
-
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
- setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
@@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256)
@@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
@@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
-
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
@@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
+ // to 512-bit rather than use the AVX2 instructions so that we can use
+ // k-masks.
if (!Subtarget.hasVLX()) {
- // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
- // to 512-bit rather than use the AVX2 instructions so that we can use
- // k-masks.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
@@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
@@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v32i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v64i8, Custom);
- setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
-
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium()))
+ if (Subtarget.is32Bit() &&
+ (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
@@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
@@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 &&
- (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX()) {
+ if (Size >= 32 && Subtarget.hasAVX() &&
+ (Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
@@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// multiply) before we splat as a vector.
return MVT::v32i8;
}
- if (Subtarget.hasSSE2())
+ if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+ if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+ (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
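
The net effect of the preferred-vector-width checks added above can be summarized as a small decision cascade. A standalone sketch in plain C++ (the Features struct and chooseMemOpType function are illustrative stand-ins, not the Subtarget API):

#include <cassert>
#include <string>

// Illustrative stand-in for the subtarget queries used above.
struct Features {
  bool HasAVX, HasSSE2;
  unsigned PreferVectorWidth; // e.g. 128 on prefer-128-bit CPUs, else 256
};

// Rough sketch of the memset/memcpy type choice for large, sufficiently
// aligned copies: wider vectors are only used when both the ISA and the
// preferred vector width allow it.
static std::string chooseMemOpType(unsigned Size, const Features &F) {
  if (Size >= 32 && F.HasAVX && F.PreferVectorWidth >= 256)
    return "v32i8";
  if (Size >= 16 && F.HasSSE2 && F.PreferVectorWidth >= 128)
    return "v16i8";
  return "scalar";
}

int main() {
  // AVX present, but the CPU prefers 128-bit ops: fall back to v16i8.
  Features F{/*HasAVX=*/true, /*HasSSE2=*/true, /*PreferVectorWidth=*/128};
  assert(chooseMemOpType(64, F) == "v16i8");
  return 0;
}
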
@@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
return true;
}
-bool
-X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
@@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// TODO: What about AVX-512 (512-bit) accesses?
}
}
+ // NonTemporal vector memory ops must be aligned.
+ if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+ // NT loads can only be vector aligned, so if it's less aligned than the
+ // minimum vector size (which we can split the vector down to), we might as
+ // well use a regular unaligned vector load.
+ // We don't have any NT loads pre-SSE41.
+ if (!!(Flags & MachineMemOperand::MOLoad))
+ return (Align < 16 || !Subtarget.hasSSE41());
+ return false;
+ }
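
A compact restatement of the non-temporal policy above as a standalone predicate in plain C++ (illustrative only; the function name is invented, and it returns true when the misaligned access should still be allowed):

#include <cassert>

static bool allowMisalignedNTVectorAccess(bool IsLoad, unsigned Align,
                                          bool HasSSE41) {
  // NT loads below 16-byte alignment (or without SSE4.1, which has no NT
  // loads at all) might as well be regular unaligned vector loads.
  if (IsLoad)
    return Align < 16 || !HasSSE41;
  // NT vector stores must always be aligned.
  return false;
}

int main() {
  assert(allowMisalignedNTVectorAccess(/*IsLoad=*/true, 8, true));   // use a plain unaligned load
  assert(!allowMisalignedNTVectorAccess(/*IsLoad=*/true, 16, true)); // keep the aligned NT load
  assert(!allowMisalignedNTVectorAccess(/*IsLoad=*/false, 8, true)); // NT stores stay aligned
  return 0;
}
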
// Misaligned accesses of any size are always allowed.
return true;
}
@@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- auto *SecurityCheckCookie = cast<Function>(
- M.getOrInsertFunction("__security_check_cookie",
- Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext())));
- SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
- SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+ F->setCallingConv(CallingConv::X86_FastCall);
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+ }
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
@@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
@@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "X86GenCallingConv.inc"
-
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
@@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
"The values should reside in two registers");
SDValue Lo, Hi;
- unsigned Reg;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
+ // Swift:
+ case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
else
ValVT = VA.getValVT();
- // Calculate SP offset of interrupt parameter, re-arrange the slot normally
- // taken by a return address.
- int Offset = 0;
- if (CallConv == CallingConv::X86_INTR) {
- // X86 interrupts may take one or two arguments.
- // On the stack there will be no return address as in regular call.
- // Offset of last argument need to be set to -4/-8 bytes.
- // Where offset of the first argument out of two, should be set to 0 bytes.
- Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
- if (Subtarget.is64Bit() && Ins.size() == 2) {
- // The stack pointer needs to be realigned for 64 bit handlers with error
- // code, so the argument offset changes by 8 bytes.
- Offset += 8;
- }
- }
-
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
@@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
- if (Flags.isCopyElisionCandidate()) {
+ // If the argument is passed directly in memory without any extension, then we
+ // can perform copy elision. Large vector types, for example, may be passed
+ // indirectly by pointer.
+ if (Flags.isCopyElisionCandidate() &&
+ VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
@@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
- /*Immutable=*/false);
+ /*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
@@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
MFI.setObjectSExt(FI, true);
}
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
-
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
@@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
- if (CallConv == CallingConv::X86_INTR) {
- bool isLegal = Ins.size() == 1 ||
- (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
- (!Is64Bit && Ins[1].VT == MVT::i32)));
- if (!isLegal)
- report_fatal_error("X86 interrupts may take one or two arguments");
- }
-
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
// Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &F : Forwards) {
+ for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
- Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
+ FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
@@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ MachineFunction::CallSiteInfo CSInfo;
+
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
@@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (Callee->getOpcode() == ISD::GlobalAddress) {
- // If the callee is a GlobalAddress node (quite common, every direct call
- // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
- // it.
- GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
-
- // We should use extra load for direct calls to dllimported functions in
- // non-JIT mode.
- const GlobalValue *GV = G->getGlobal();
- if (!GV->hasDLLImportStorageClass()) {
- unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
-
- Callee = DAG.getTargetGlobalAddress(
- GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- // Add a wrapper.
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- // Add extra indirection
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlags =
- Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
-
- Callee = DAG.getTargetExternalSymbol(
- S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
+ } else if (Callee->getOpcode() == ISD::GlobalAddress ||
+ Callee->getOpcode() == ISD::ExternalSymbol) {
+ // Lower direct calls to global addresses and external symbols. Setting
+ // ForCall to true here has the effect of removing WrapperRIP when possible
+ // to allow direct calls to be selected without first materializing the
+ // address into a register.
+ Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
@@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
@@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!IntrData)
return false;
- Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
@@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
@@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
break;
}
+ case GATHER:
+ case GATHER_AVX2: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOLoad;
+ break;
+ }
+ case SCATTER: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOStore;
+ break;
+ }
default:
return false;
}
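
For the gather/scatter cases above, the memory VT is built from the data element type but with the element count clamped to whichever of the data or index vector is shorter. A worked example as a standalone sketch in plain C++ (the helper name is invented; types are reduced to element counts):

#include <algorithm>
#include <cassert>

// Illustrative only: the number of memory elements touched by a gather or
// scatter is limited by both the data vector and the index vector.
static unsigned gatherScatterMemElts(unsigned DataElts, unsigned IndexElts) {
  return std::min(DataElts, IndexElts);
}

int main() {
  // e.g. a gather producing v2i64 data from v4i32 indices only reads two
  // 64-bit elements, so memVT would be v2i64 here.
  assert(gatherScatterMemElts(2, 4) == 2);
  return 0;
}
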
@@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
@@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+ // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
+ // those uses are extracted directly into a store, then the extract + store
+ // can be store-folded. Therefore, it's probably not worth splitting the load.
+ EVT VT = Load->getValueType(0);
+ if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+ for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
+ // Skip uses of the chain value. Result 0 of the node is the load value.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // If this use is not an extract + store, it's probably worth splitting.
+ if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
+ UI->use_begin()->getOpcode() != ISD::STORE)
+ return true;
+ }
+ // All non-chain uses are extract + store.
+ return false;
+ }
+
return true;
}
@@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+ unsigned Opc = VecOp.getOpcode();
+
+ // Assume target opcodes can't be scalarized.
+ // TODO - do we have any exceptions?
+ if (Opc >= ISD::BUILTIN_OP_END)
+ return false;
+
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
- if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT))
+ if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
- return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
+ return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+ // TODO: Allow vectors?
+ if (VT.isVector())
+ return false;
+ return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
@@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
- EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
@@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
- return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+ // If both types are legal vectors, it's always ok to convert them.
+ if (LoadVT.isVector() && BitcastVT.isVector() &&
+ isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ return true;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
@@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
+ // Make sure we don't merge greater than our preferred vector
+ // width.
+ if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+ return false;
return true;
}
@@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
return Subtarget.hasSSE2();
}
-bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
+bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ EVT VT = N->getValueType(0);
+ if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
+ (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
+ // Only fold if the shift values are equal - so it folds to AND.
+ // TODO - we should fold if either is a non-uniform vector but we don't do
+ // the fold for non-splats yet.
+ return N->getOperand(1) == N->getOperand(0).getOperand(1);
+ }
+ return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
+}
+
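
The fold guarded above relies on the identity that a right shift followed by a left shift by the same constant amount (or vice versa) just clears bits, so it can be replaced by a single AND with a mask. A concrete worked example in plain C++:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0xDEADBEEF;
  const unsigned C = 4;
  // srl+shl by the same amount clears the low C bits...
  uint32_t ShiftPair = (X >> C) << C;
  // ...which is exactly an AND with an all-ones mask shifted left by C.
  uint32_t Masked = X & (~0u << C);
  assert(ShiftPair == Masked && Masked == 0xDEADBEE0);
  return 0;
}
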
+bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
-/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size is the undef sentinel value.
+/// Return true if every element in Mask, beginning from position Pos and ending
+/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (Mask[i] != SM_SentinelUndef)
@@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
+/// Return true if the mask creates a vector whose lower half is undefined.
+static bool isUndefLowerHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, 0, NumElts / 2);
+}
+
+/// Return true if the mask creates a vector whose upper half is undefined.
+static bool isUndefUpperHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
+}
+
/// Return true if Val falls within the specified range (L, H].
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
@@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
DAG.getIntPtrConstant(0, dl));
}
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned WideSizeInBits) {
+ assert(Vec.getValueSizeInBits() < WideSizeInBits &&
+ (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
+ "Unsupported vector widening type");
+ unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
+ MVT SVT = Vec.getSimpleValueType().getScalarType();
+ MVT VT = MVT::getVectorVT(SVT, WideNumElts);
+ return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
+}
+
+// Helper function to collect subvector ops that are concated together,
+// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
+// The subvectors in Ops are guaranteed to be the same type.
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.empty() && "Expected an empty ops vector");
+
+ if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+ Ops.append(N->op_begin(), N->op_end());
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Src = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ const APInt &Idx = N->getConstantOperandAPInt(2);
+ EVT VT = Src.getValueType();
+ EVT SubVT = Sub.getValueType();
+
+ // TODO - Handle more general insert_subvector chains.
+ if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
+ Idx == (VT.getVectorNumElements() / 2) &&
+ Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
-// Return true if the instruction zeroes the unused upper part of the
-// destination and accepts mask.
-static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
- switch (Opcode) {
- default:
- return false;
- case X86ISD::CMPM:
- case X86ISD::CMPM_RND:
- case ISD::SETCC:
- return true;
- }
-}
-
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
return DAG.getBitcast(VT, Vec);
}
-static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
- SelectionDAG &DAG) {
+// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
+static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND_VECTOR_INREG;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND_VECTOR_INREG;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND_VECTOR_INREG;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
+static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
+ assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
+ ISD::ZERO_EXTEND == Opcode) &&
+ "Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
@@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
InVT = In.getValueType();
}
- if (VT.getVectorNumElements() == InVT.getVectorNumElements())
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
- DL, VT, In);
+ if (VT.getVectorNumElements() != InVT.getVectorNumElements())
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG,
- DL, VT, In);
+ return DAG.getNode(Opcode, DL, VT, In);
}
/// Returns a vector_shuffle node for an unpackl operation.
@@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
-static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
- while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- V = V.getOperand(0);
- return V;
-}
-
-static const Constant *getTargetConstantFromNode(SDValue Op) {
- Op = peekThroughBitcasts(Op);
-
- auto *Load = dyn_cast<LoadSDNode>(Op);
- if (!Load)
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
@@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+ Op = peekThroughBitcasts(Op);
+ return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
+}
+
+const Constant *
+X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
+ assert(LD && "Unexpected null LoadSDNode");
+ return getTargetConstantFromNode(LD);
+}
+
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
@@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
- APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
- EltBits[i] = Bits.getZExtValue();
+ EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
@@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
}
+ // Extract constant bits from a subvector broadcast.
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
+ SmallVector<APInt, 16> SubEltBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, SubEltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ UndefElts = APInt::getSplat(NumElts, UndefElts);
+ while (EltBits.size() < NumElts)
+ EltBits.append(SubEltBits.begin(), SubEltBits.end());
+ return true;
+ }
+ }
+
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
@@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return CastBitData(UndefSrcElts, SrcEltBits);
}
+ // Insert constant bits from a base and sub vector sources.
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ // TODO - support insert_subvector through bitcasts.
+ if (EltSizeInBits != VT.getScalarSizeInBits())
+ return false;
+
+ APInt UndefSubElts;
+ SmallVector<APInt, 32> EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ UndefSubElts, EltSubBits,
+ AllowWholeUndefs, AllowPartialUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, EltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ unsigned BaseIdx = Op.getConstantOperandVal(2);
+ UndefElts.insertBits(UndefSubElts, BaseIdx);
+ for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
+ EltBits[BaseIdx + i] = EltSubBits[i];
+ return true;
+ }
+ }
+
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
}
}
+// Split the demanded elts of a HADD/HSUB node between its operands.
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = DemandedElts.getBitWidth();
+ int NumEltsPerLane = NumElts / NumLanes;
+ int HalfEltsPerLane = NumEltsPerLane / 2;
+
+ DemandedLHS = APInt::getNullValue(NumElts);
+ DemandedRHS = APInt::getNullValue(NumElts);
+
+ // Map DemandedElts to the horizontal operands.
+ for (int Idx = 0; Idx != NumElts; ++Idx) {
+ if (!DemandedElts[Idx])
+ continue;
+ int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
+ int LocalIdx = Idx % NumEltsPerLane;
+ if (LocalIdx < HalfEltsPerLane) {
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ } else {
+ LocalIdx -= HalfEltsPerLane;
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ }
+ }
+}
+
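
To make the index arithmetic above concrete: for a 128-bit v8i16 HADD, result elements 0-3 come from adjacent pairs of the LHS and elements 4-7 from adjacent pairs of the RHS. A standalone sketch of the same mapping in plain C++ (plain arrays stand in for APInt; this mirrors getHorizDemandedElts but is not LLVM code):

#include <cassert>

int main() {
  // v8i16 in a single 128-bit lane: 8 elements per lane, half from each side.
  const int NumEltsPerLane = 8, HalfEltsPerLane = 4;
  bool DemandedLHS[8] = {}, DemandedRHS[8] = {};

  // Demand only result element 5 (same mapping as getHorizDemandedElts).
  int Idx = 5;
  int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; // 0
  int LocalIdx = Idx % NumEltsPerLane;                   // 5
  if (LocalIdx < HalfEltsPerLane) {
    DemandedLHS[LaneIdx + 2 * LocalIdx + 0] = true;
    DemandedLHS[LaneIdx + 2 * LocalIdx + 1] = true;
  } else {
    LocalIdx -= HalfEltsPerLane;                         // 1
    DemandedRHS[LaneIdx + 2 * LocalIdx + 0] = true;
    DemandedRHS[LaneIdx + 2 * LocalIdx + 1] = true;
  }

  // PHADDW result[5] = RHS[2] + RHS[3], so only those two RHS elements matter.
  assert(DemandedRHS[2] && DemandedRHS[3] && !DemandedLHS[0]);
  return 0;
}
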
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG);
+ SelectionDAG &DAG);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
-static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
+ SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
- "Expected byte aligned value types");
+ if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
+ return false;
+ assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::OR: {
+ // Inspect each operand at the byte level. We can merge these into a
+ // blend shuffle mask if for each byte at least one is masked out (zero).
+ KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
+ KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
+ if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
+ bool IsByteMask = true;
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
+ APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
+ APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
+ for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
+ unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
+ unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
+ if (LHS == 255 && RHS == 0)
+ SelectMask.setBit(i);
+ else if (LHS == 255 && RHS == 255)
+ ZeroMask.setBit(i);
+ else if (!(LHS == 0 && RHS == 255))
+ IsByteMask = false;
+ }
+ if (IsByteMask) {
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
+ for (unsigned j = 0; j != NumBytesPerElt; ++j) {
+ unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
+ int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
+ Mask.push_back(Idx);
+ }
+ }
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ }
+
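
The idea behind the new OR handling: if, for every byte position, at least one operand is known to be zero, the OR simply selects bytes from one side or the other and can be modelled as a blend shuffle. A concrete scalar illustration in plain C++ (the real code reasons about per-element KnownBits rather than constants):

#include <cassert>
#include <cstdint>

int main() {
  // Per byte, exactly one operand can contribute: A owns the high two bytes,
  // B owns the low two bytes, so OR(A, B) is just a byte-wise select.
  uint32_t A = 0x11220000; // low two bytes known zero
  uint32_t B = 0x00003344; // high two bytes known zero
  uint32_t Blend = (A & 0xFFFF0000u) | (B & 0x0000FFFFu); // byte-blend view
  assert((A | B) == Blend && Blend == 0x11223344u);
  return 0;
}
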
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
@@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::INSERT_SUBVECTOR: {
- // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where
- // SRC0/SRC1 are both of the same valuetype VT.
- // TODO - add peekThroughOneUseBitcasts support.
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
@@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
+ uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0).getValueType() == VT &&
+ isa<ConstantSDNode>(Sub.getOperand(1))) {
+ uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
+ Ops.push_back(Src);
+ Ops.push_back(Sub.getOperand(0));
+ return true;
+ }
+ // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
- if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) ||
- SubMask.size() != NumSubElts)
+ if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG))
return false;
+ if (SubMask.size() != NumSubElts) {
+ assert(((SubMask.size() % NumSubElts) == 0 ||
+ (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
+ if ((NumSubElts % SubMask.size()) == 0) {
+ int Scale = NumSubElts / SubMask.size();
+ SmallVector<int,64> ScaledSubMask;
+ scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ SubMask = ScaledSubMask;
+ } else {
+ int Scale = SubMask.size() / NumSubElts;
+ NumSubElts = SubMask.size();
+ NumElts *= Scale;
+ InsertIdx *= Scale;
+ }
+ }
Ops.push_back(Src);
for (SDValue &SubInput : SubInputs) {
- if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- SubInput.getOperand(0).getValueType() != VT ||
- !isa<ConstantSDNode>(SubInput.getOperand(1)))
- return false;
- Ops.push_back(SubInput.getOperand(0));
+ EVT SubSVT = SubInput.getValueType().getScalarType();
+ EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
+ NumSizeInBits / SubSVT.getSizeInBits());
+ Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
+ DAG.getUNDEF(AltVT), SubInput,
+ DAG.getIntPtrConstant(0, SDLoc(N))));
}
- int InsertIdx = N.getConstantOperandVal(2);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
- int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
- M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
+ M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
@@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
+ APInt EltsLHS, EltsRHS;
+ getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
+
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
- if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
- (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
+ if ((!N0.isUndef() &&
+ DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
+ (!N1.isUndef() &&
+ DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
- (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
+ if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
+ (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
return false;
}
@@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
- case ISD::ZERO_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND: {
- // TODO - add support for VPMOVZX with smaller input vector types.
+ case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
- if (NumSizeInBits != SrcVT.getSizeInBits())
- break;
- DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ if (!SrcVT.isVector())
+ return false;
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal broadcast type");
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumSizeInBits / SrcVT.getScalarSizeInBits());
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
+ Ops.push_back(Src);
+ Mask.append(NumElts, 0);
+ return true;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+
+ unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
+ bool IsAnyExtend =
+ (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
+ DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal zero-extension type");
+ SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
+ NumSizeInBits / NumSrcBitsPerElt);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
Ops.push_back(Src);
return true;
}
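
The mask produced for these extend nodes interleaves source elements with zero (or undef, for any-extend) sentinels. A simplified standalone sketch of the shape of that mask in plain C++, not the actual DecodeZeroExtendMask implementation (the zextShuffleMask name is invented, and -1 stands in for the zero/undef sentinel):

#include <cassert>
#include <vector>

// Simplified sketch: a zero-extension from SrcBits to DstBits keeps every
// source element and pads each one with (Scale - 1) sentinel entries.
static std::vector<int> zextShuffleMask(unsigned SrcBits, unsigned DstBits,
                                        unsigned NumDstElts) {
  unsigned Scale = DstBits / SrcBits;
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back((int)i);                  // take source element i
    Mask.insert(Mask.end(), Scale - 1, -1);  // rest of the wide element is zero
  }
  return Mask;
}

int main() {
  // e.g. PMOVZXWD (v8i16 -> v4i32): {0, Z, 1, Z, 2, Z, 3, Z} at i16 granularity.
  std::vector<int> Expected = {0, -1, 1, -1, 2, -1, 3, -1};
  assert(zextShuffleMask(16, 32, 4) == Expected);
  return 0;
}
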
@@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
-/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
@@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
M = SM_SentinelUndef;
// Check for unused inputs.
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedInputs.push_back(Inputs[i]);
+ if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
continue;
}
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
+
+ // Check for repeated inputs.
+ bool IsRepeat = false;
+ for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
+ if (UsedInputs[j] != Inputs[i])
+ continue;
+ for (int &M : Mask)
+ if (lo <= M)
+ M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
+ IsRepeat = true;
+ break;
+ }
+ if (IsRepeat)
+ continue;
+
+ UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
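The repeated-input handling added to resolveTargetShuffleInputsAndMask remaps mask entries that reference a duplicate input so they index the first occurrence, then drops the duplicate. A small standalone sketch of just that remapping, with plain integers standing in for the SDValue inputs (illustrative only):

// Illustrative only: collapse duplicate shuffle inputs and rewrite the mask
// so later references point at the surviving copy.
#include <cassert>
#include <vector>

static void DedupInputs(std::vector<int> &Inputs, std::vector<int> &Mask) {
  int MaskWidth = static_cast<int>(Mask.size());
  std::vector<int> Used;
  for (int i = 0, e = static_cast<int>(Inputs.size()); i != e; ++i) {
    int Lo = i * MaskWidth, Hi = Lo + MaskWidth;
    bool Repeat = false;
    for (int j = 0, ue = static_cast<int>(Used.size()); j != ue; ++j) {
      if (Used[j] != Inputs[i])
        continue;
      for (int &M : Mask)
        if (Lo <= M)
          M = (M < Hi) ? (M - Lo) + j * MaskWidth : M - MaskWidth;
      Repeat = true;
      break;
    }
    if (!Repeat)
      Used.push_back(Inputs[i]);
  }
  Inputs = Used;
}

int main() {
  // Two inputs that are really the same value (42); mask entries 4..7 refer
  // to the second copy and get folded back onto the first.
  std::vector<int> Inputs = {42, 42};
  std::vector<int> Mask = {0, 4, 1, 5};
  DedupInputs(Inputs, Mask);
  assert(Inputs.size() == 1);
  assert(Mask[1] == 0 && Mask[3] == 1);
  return 0;
}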
@@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
- if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
+ if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
return false;
resolveTargetShuffleInputsAndMask(Inputs, Mask);
@@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
Depth+1);
}
+ // Recurse into insert_subvector base/sub vector to find scalars.
+ if (Opcode == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = N->getConstantOperandVal(2);
+
+ if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
+ return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ }
+
+ // Recurse into extract_subvector src vector to find scalars.
+ if (Opcode == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Src = N->getOperand(0);
+ uint64_t SrcIdx = N->getConstantOperandVal(1);
+ return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ }
+
// Actual nodes that may contain scalar elements
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
@@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
// If the build vector contains zeros or our first insertion is not the
  // first index, then insert into a zero vector to break any register
-  // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
+  // dependency; otherwise use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
@@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
@@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
SDLoc dl(Op);
SDValue V;
- bool First = true;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
- for (unsigned i = 0; i < 16; ++i) {
+ for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
- if (ThisIsNonZero && First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+ if (!ThisIsNonZero && !NextIsNonZero)
+ continue;
+
+    // FIXME: Investigate combining the first 4 bytes as an i32 instead.
+ SDValue Elt;
+ if (ThisIsNonZero) {
+ if (NumZero || NextIsNonZero)
+ Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
- V = DAG.getUNDEF(MVT::v8i16);
- First = false;
+ Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
- if ((i & 1) != 0) {
- // FIXME: Investigate extending to i32 instead of just i16.
- // FIXME: Investigate combining the first 4 bytes as a i32 instead.
- SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
- if (LastIsNonZero) {
- LastElt =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
- }
- if (ThisIsNonZero) {
- ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
- DAG.getConstant(8, dl, MVT::i8));
- if (LastIsNonZero)
- ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
- } else
- ThisElt = LastElt;
-
- if (ThisElt) {
- if (1 == i) {
- V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
- : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v8i16, V);
- } else {
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i / 2, dl));
- }
+ if (NextIsNonZero) {
+ SDValue NextElt = Op.getOperand(i + 1);
+ if (i == 0 && NumZero)
+ NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
+ else
+ NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
+ NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+ DAG.getConstant(8, dl, MVT::i8));
+ if (ThisIsNonZero)
+ Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+ else
+ Elt = NextElt;
+ }
+
+    // If our first insertion is not at the first index, insert into a zero
+    // vector to break any register dependency; otherwise use SCALAR_TO_VECTOR.
+ if (!V) {
+ if (i != 0)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
}
}
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+ DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
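The rewritten pre-SSE4.1 v16i8 path above now walks the bytes in pairs and builds each 16-bit PINSRW operand as byte i OR'd with byte i+1 shifted left by 8. A standalone model of that pairing, assuming x86's little-endian lane order (illustrative only):

// Illustrative only: combine adjacent bytes of a 16-byte build_vector into
// the eight 16-bit values a PINSRW-based lowering would insert.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint16_t, 8> PairBytes(const std::array<uint8_t, 16> &B) {
  std::array<uint16_t, 8> Words{};
  for (int i = 0; i < 16; i += 2)
    Words[i / 2] = static_cast<uint16_t>(B[i]) |
                   static_cast<uint16_t>(B[i + 1]) << 8; // byte i+1 is high
  return Words;
}

int main() {
  std::array<uint8_t, 16> Bytes{};
  Bytes[0] = 0x34;
  Bytes[1] = 0x12;
  assert(PairBytes(Bytes)[0] == 0x1234); // little-endian: byte 1 is the high half
  return 0;
}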
@@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
}
// Find all zeroable elements.
- std::bitset<4> Zeroable;
- for (int i=0; i < 4; ++i) {
- SDValue Elt = Op->getOperand(i);
+ std::bitset<4> Zeroable, Undefs;
+ for (int i = 0; i < 4; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
@@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
- for (unsigned i=0; i < 4; ++i) {
+ for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
- SDValue Elt = Op->getOperand(i);
+ SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
@@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
- SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ SDValue VZeroOrUndef = (Zeroable == Undefs)
+ ? DAG.getUNDEF(VT)
+ : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
- return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
@@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
- CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
@@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
- SmallBitVector LoadMask(NumElems, false);
- SmallBitVector ZeroMask(NumElems, false);
- SmallBitVector UndefMask(NumElems, false);
+ APInt LoadMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getNullValue(NumElems);
+
+ SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
+ continue;
+ }
+ if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
+ ZeroMask.setBit(i);
+ continue;
+ }
+
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
- if (Elt.isUndef())
- UndefMask[i] = true;
- else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
- ZeroMask[i] = true;
- else if (ISD::isNON_EXTLoad(Elt.getNode())) {
- LoadMask[i] = true;
- LastLoadedElt = i;
- // Each loaded element must be the correct fractional portion of the
- // requested vector load.
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
- return SDValue();
- } else
+ if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
+
+ Loads[i] = cast<LoadSDNode>(Elt);
+ LoadMask.setBit(i);
+ LastLoadedElt = i;
}
- assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
+ LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
- if (UndefMask.count() == NumElems)
+ if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
- if ((ZeroMask | UndefMask).count() == NumElems)
+ if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- int FirstLoadedElt = LoadMask.find_first();
+ int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
- LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
- EVT LDBaseVT = EltBase.getValueType();
+ EVT EltBaseVT = EltBase.getValueType();
+ assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
+ "Register/Memory size mismatch");
+ LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
+ unsigned BaseSizeInBytes = BaseSizeInBits / 8;
+ int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
+ assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with UNDEF and ZERO elements require a
@@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- SDValue Elt = peekThroughBitcasts(Elts[i]);
- LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.areNonVolatileConsecutiveLoads(
- LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
- i - FirstLoadedElt)) {
+ if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
+ i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
@@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- SmallVector<LoadSDNode *, 8> Loads;
- for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
- if (LoadMask[i])
- Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
-
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
@@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, NewLd);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
- // LOAD - all consecutive load/undefs (must start/end with a load).
- // If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer.
- // If the vector contains zeros, then attempt to shuffle those elements.
- if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ // Check if the base load is entirely dereferenceable.
+ bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
+ VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+ // LOAD - all consecutive load/undefs (must start/end with a load or be
+ // entirely dereferenceable). If we have found an entire vector of loads and
+ // undefs, then return a large load of the entire vector width starting at the
+ // base pointer. If the vector contains zeros, then attempt to shuffle those
+ // elements.
+ if (FirstLoadedElt == 0 &&
+ (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
- assert(LDBase && "Did not find base load for merging consecutive loads");
- EVT EltVT = LDBase->getValueType(0);
- // Ensure that the input vector size for the merged loads matches the
- // cumulative size of the input elements.
- if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
- return SDValue();
-
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
@@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
- if (IsConsecutiveLoad)
+ if (NumElems == 1)
+ return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
+
+ if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
- if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ if (!isAfterLegalize && VT.isVector()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
@@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- int LoadSize =
- (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+ // If the upper half of a ymm/zmm load is undef then just load the lower half.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned HalfNumElems = NumElems / 2;
+ if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
+ SDValue HalfLD =
+ EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
+ DAG, Subtarget, isAfterLegalize);
+ if (HalfLD)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ HalfLD, DAG.getIntPtrConstant(0, DL));
+ }
+ }
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
- (LoadSize == 32 || LoadSize == 64) &&
+ (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
- : MVT::getIntegerVT(LoadSize);
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
+ : MVT::getIntegerVT(LoadSizeInBits);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, ResNode);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
+ // BROADCAST - match the smallest possible repetition pattern, load that
+ // scalar/subvector element and then broadcast to the entire vector.
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
+ (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
+ for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
+ unsigned RepeatSize = SubElems * BaseSizeInBits;
+ unsigned ScalarSize = std::min(RepeatSize, 64u);
+ if (!Subtarget.hasAVX2() && ScalarSize < 32)
+ continue;
+
+ bool Match = true;
+ SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
+ for (unsigned i = 0; i != NumElems && Match; ++i) {
+ if (!LoadMask[i])
+ continue;
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (RepeatedLoads[i % SubElems].isUndef())
+ RepeatedLoads[i % SubElems] = Elt;
+ else
+ Match &= (RepeatedLoads[i % SubElems] == Elt);
+ }
+
+ // We must have loads at both ends of the repetition.
+ Match &= !RepeatedLoads.front().isUndef();
+ Match &= !RepeatedLoads.back().isUndef();
+ if (!Match)
+ continue;
+
+ EVT RepeatVT =
+ VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
+ ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
+ : EVT::getFloatingPointVT(ScalarSize);
+ if (RepeatSize > ScalarSize)
+ RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
+ RepeatSize / ScalarSize);
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
+ VT.getSizeInBits() / ScalarSize);
+ if (TLI.isTypeLegal(BroadcastVT)) {
+ if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
+ unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
+ : X86ISD::VBROADCAST;
+ SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
+ return DAG.getBitcast(VT, Broadcast);
+ }
+ }
+ }
+ }
+
return SDValue();
}
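The new BROADCAST block above looks for the smallest power-of-two prefix that the element list merely repeats, so only that prefix needs to be loaded before being broadcast. A standalone sketch of the repetition test, with integers standing in for loaded elements and -1 for undef (illustrative only):

// Illustrative only: find the smallest power-of-two repetition length such
// that every defined element matches the element at (i % SubElems).
#include <cassert>
#include <vector>

constexpr int Undef = -1;

static int SmallestRepeat(const std::vector<int> &Elts) {
  int NumElems = static_cast<int>(Elts.size());
  for (int SubElems = 1; SubElems < NumElems; SubElems *= 2) {
    std::vector<int> Rep(SubElems, Undef);
    bool Match = true;
    for (int i = 0; i != NumElems && Match; ++i) {
      if (Elts[i] == Undef)
        continue;
      if (Rep[i % SubElems] == Undef)
        Rep[i % SubElems] = Elts[i];
      else
        Match = Rep[i % SubElems] == Elts[i];
    }
    // Require real (non-undef) elements at both ends of the repeated block.
    if (Match && Rep.front() != Undef && Rep.back() != Undef)
      return SubElems;
  }
  return -1; // no repetition shorter than the full vector
}

int main() {
  // <a, b, a, b, a, undef, a, b> repeats every 2 elements.
  std::vector<int> Elts = {10, 20, 10, 20, 10, Undef, 10, 20};
  assert(SmallestRepeat(Elts) == 2);
  return 0;
}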
+// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
+// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
+// are consecutive, non-overlapping, and in the right order.
+static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool isAfterLegalize) {
+ SmallVector<SDValue, 64> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ return SDValue();
+ }
+ assert(Elts.size() == VT.getVectorNumElements());
+ return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
+ isAfterLegalize);
+}
+
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
@@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
-static bool isUseOfShuffle(SDNode *N) {
+static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
- if (isTargetShuffle(U->getOpcode()))
+ unsigned Opc = U->getOpcode();
+ // VPERMV/VPERMV3 shuffles can never fold their index operands.
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
+ return false;
+ if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
+ return false;
+ if (isTargetShuffle(Opc))
+ return true;
+ if (Opc == ISD::BITCAST) // Ignore bitcasts
+ return isFoldableUseOfShuffle(U);
+ if (N->hasOneUse())
return true;
- if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
- return isUseOfShuffle(U);
}
return false;
}
@@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
- if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+ if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (BV->getOperand(i).isUndef())
+ DemandedElts.clearBit(i);
+
+ // If we don't need the upper xmm, then perform as a xmm hop.
+ unsigned HalfNumElts = NumElts / 2;
+ if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
+ SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
+ return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
+ }
+
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
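The early-out added to getHopForBuildVector relies on 256-bit horizontal ops working independently per 128-bit lane: if nothing in the upper lane of the result is demanded, a 128-bit hop on the low halves already produces every bit that matters. A standalone model using HADDPS-style semantics (illustrative only; names are invented):

// Illustrative only: 256-bit HADD is two independent 128-bit HADDs, so the
// lower lane of the wide op equals a narrow HADD of the low halves.
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;
using V8 = std::array<float, 8>;

static V4 Hadd128(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

static V8 Hadd256(const V8 &A, const V8 &B) {
  // Each 128-bit lane is an independent Hadd128.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3],
          A[4] + A[5], A[6] + A[7], B[4] + B[5], B[6] + B[7]};
}

int main() {
  V8 A = {1, 2, 3, 4, 5, 6, 7, 8};
  V8 B = {8, 7, 6, 5, 4, 3, 2, 1};
  V8 Wide = Hadd256(A, B);
  V4 Narrow = Hadd128({A[0], A[1], A[2], A[3]}, {B[0], B[1], B[2], B[3]});
  for (int i = 0; i != 4; ++i)
    assert(Wide[i] == Narrow[i]); // lower half agrees, upper half unneeded
  return 0;
}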
@@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
- unsigned NumNonUndefs = 0;
- for (const SDValue &V : BV->op_values())
- if (!V.isUndef())
- ++NumNonUndefs;
-
+ unsigned NumNonUndefs =
+ count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
@@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
- unsigned HOpcode;
- SDValue V0, V1;
- if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())
+ if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
+ ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
+ unsigned HOpcode;
+ SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
+ }
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
@@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ IsShift = true;
+ break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
@@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
+
+ // Extend shift amounts.
+ if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
+ if (!IsShift)
+ return SDValue();
+ RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
+ }
+
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
+ // Limit to shifts by uniform immediates.
+ // TODO: Only accept vXi8/vXi64 special cases?
+ // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+ if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+ return SDValue();
+
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
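lowerBuildVectorToBitOp, as extended above, splits a build_vector of per-element binops into two build_vectors plus one vector op, and for the newly added shift opcodes it additionally requires a single uniform immediate. A standalone sketch of that splitting rule (illustrative only; the names below are invented):

// Illustrative only: split a build_vector of binops into one vector op on
// two build_vectors; shifts additionally need a single, uniform immediate.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct Split {
  bool Ok = false;
  std::vector<uint32_t> LHS, RHS;
};

// Elements are (Value op Imm) pairs; Op is '&', '|', '^' or '<' (shl).
static Split SplitBuildVector(
    char Op, const std::vector<std::pair<uint32_t, uint32_t>> &Elts) {
  Split S;
  for (auto &E : Elts) {
    S.LHS.push_back(E.first);
    S.RHS.push_back(E.second);
  }
  // Vector shifts by immediate take one count for all lanes, so reject
  // non-uniform shift amounts (logic ops are fine with per-lane constants).
  if (Op == '<')
    for (uint32_t R : S.RHS)
      if (R != S.RHS.front())
        return S;
  S.Ok = true;
  return S;
}

int main() {
  Split Logic = SplitBuildVector('&', {{1, 0xF}, {2, 0xF0}, {3, 1}, {4, 2}});
  assert(Logic.Ok); // per-lane AND masks are fine
  Split Shift = SplitBuildVector('<', {{1, 3}, {2, 4}, {3, 3}, {4, 3}});
  assert(!Shift.Ok); // mixed shift amounts cannot become a single PSLLD
  return 0;
}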
@@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
return Vec;
}
-// Return true if all the operands of the given CONCAT_VECTORS node are zeros
-// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
-static bool isExpandWithZeros(const SDValue &Op) {
- assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
- "Expand with zeros only possible in CONCAT_VECTORS nodes!");
-
- for (unsigned i = 1; i < Op.getNumOperands(); i++)
- if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
- return false;
-
- return true;
-}
-
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
-static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
- unsigned Opc = Op.getOpcode();
-
- assert(Opc == ISD::CONCAT_VECTORS &&
- Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
- "Unexpected node to check for type promotion!");
-
- // As long as we are concatenating zeros to the upper part of a previous node
- // result, climb up the tree until a node with different opcode is
- // encountered
- while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
- if (Opc == ISD::INSERT_SUBVECTOR) {
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
- Op.getConstantOperandVal(2) == 0)
- Op = Op.getOperand(1);
- else
- return SDValue();
- } else { // Opc == ISD::CONCAT_VECTORS
- if (isExpandWithZeros(Op))
- Op = Op.getOperand(0);
- else
- return SDValue();
- }
- Opc = Op.getOpcode();
- }
-
- // Check if the first inserted node zeroes the upper bits, or an 'and' result
- // of a node that zeros the upper bits (its masked version).
- if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
- (Op.getOpcode() == ISD::AND &&
- (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
- isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
- return Op;
- }
-
- return SDValue();
-}
-
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
@@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
- // If this node promotes - by concatenating zeroes - the type of the result
- // of a node with instruction that zeroes all upper (irrelevant) bits of the
- // output register, mark it as legal and catch the pattern in instruction
- // selection to avoid emitting extra instructions (for zeroing upper bits).
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
- return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
-
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
@@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
+ assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+ "Illegal target shuffle mask");
for (int i = 0; i < Size; ++i)
if (Mask[i] == SM_SentinelUndef)
@@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // We can't assume a canonical shuffle mask, so try the commuted version too.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(CommutedMask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if a shuffle mask chooses elements identically in its top and
+/// bottom halves. For example, any splat mask has the same top and bottom
+/// halves. If an element is undefined in only one half of the mask, the halves
+/// are not considered identical.
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
+ unsigned HalfSize = Mask.size() / 2;
+ for (unsigned i = 0; i != HalfSize; ++i) {
+ if (Mask[i] != Mask[i + HalfSize])
+ return false;
+ }
+ return true;
+}
+
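The two helpers above check, respectively, whether a mask matches one of the UNPCKL/UNPCKH interleave patterns and whether a mask picks identically in its top and bottom halves. A standalone sketch of both ideas, using only the single-lane low-unpack form for brevity (illustrative only; not the createUnpackShuffleMask API):

// Illustrative only: the shape of an UNPCKL-style interleave mask and the
// "identical halves" test.
#include <cassert>
#include <vector>

// Low unpack of two NumElts-wide vectors interleaves their low halves:
// <0, NumElts+0, 1, NumElts+1, ...>.
static std::vector<int> UnpackLoMask(int NumElts) {
  std::vector<int> Mask;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(i);
    Mask.push_back(NumElts + i);
  }
  return Mask;
}

static bool HasIdenticalHalves(const std::vector<int> &Mask) {
  size_t Half = Mask.size() / 2;
  for (size_t i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(UnpackLoMask(4) == (std::vector<int>{0, 4, 1, 5}));
  assert(HasIdenticalHalves({0, 1, 0, 1}));  // e.g. a splat of the low pair
  assert(!HasIdenticalHalves({0, 1, 2, 3})); // identity mask: halves differ
  return 0;
}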
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable,
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
-static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
@@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const SDLoc &dl);
// X86 has dedicated shuffle that can be lowered to VEXPAND
-static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const APInt &Zeroable,
- ArrayRef<int> Mask, SDValue &V1,
- SDValue &V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
@@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getSelect(DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
-static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
@@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
-static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
@@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
return false;
}
-static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
@@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
- assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
- SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ SDValue Zero, AllOnes;
+ // Use f64 if i64 isn't legal.
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ MaskVT = MVT::getVectorVT(EltVT, Mask.size());
+ }
+
+ MVT LogicVT = VT;
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ Zero = DAG.getConstantFP(0.0, DL, EltVT);
+ AllOnes = DAG.getConstantFP(
+ APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ LogicVT =
+ MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
+ } else {
+ Zero = DAG.getConstant(0, DL, EltVT);
+ AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ }
+
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
- SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
- return DAG.getNode(ISD::AND, DL, VT, V, VMask);
+ SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
+ VMask = DAG.getBitcast(LogicVT, VMask);
+ V = DAG.getBitcast(LogicVT, V);
+ SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
+ return DAG.getBitcast(VT, And);
}
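lowerShuffleAsBitMask works because blending a vector against zero is the same as AND'ing with a constant whose lanes are either all ones or all zeros; the rework above simply performs that AND in an integer type even for FP vectors. A scalar model of the equivalence (illustrative only):

// Illustrative only: blending a vector with zero is an AND with a per-lane
// all-ones / all-zeros constant.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> V = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  // Keep lanes 0 and 2, zero lanes 1 and 3 (the "zeroable" lanes).
  std::array<uint32_t, 4> Mask = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
  std::array<uint32_t, 4> Blend{}, And{};
  for (int i = 0; i != 4; ++i) {
    Blend[i] = (Mask[i] ? V[i] : 0u); // shuffle/blend-with-zero semantics
    And[i] = V[i] & Mask[i];          // what the AND lowering emits
  }
  assert(Blend == And);
  return 0;
}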
/// Try to emit a blend instruction for a shuffle using bit math.
@@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
-static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
@@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
uint64_t BlendMask = 0;
@@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
- case MVT::v2f64:
- case MVT::v4f32:
- case MVT::v4f64:
- case MVT::v8f32:
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
+ case MVT::v4f64:
+ case MVT::v8f32:
+ assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+ LLVM_FALLTHROUGH;
+ case MVT::v2f64:
case MVT::v2i64:
+ case MVT::v4f32:
case MVT::v4i32:
- // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
- // that instruction.
- if (Subtarget.hasAVX2()) {
- // Scale the blend by the number of 32-bit dwords per element.
- int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
- V1 = DAG.getBitcast(BlendVT, V1);
- V2 = DAG.getBitcast(BlendVT, V2);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
- LLVM_FALLTHROUGH;
- case MVT::v8i16: {
- // For integer shuffles we need to expand the mask and cast the inputs to
- // v8i16s prior to blending.
- int Scale = 8 / VT.getVectorNumElements();
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = DAG.getBitcast(MVT::v8i16, V2);
- return DAG.getBitcast(VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
+ case MVT::v8i16:
+ assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
- assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
LLVM_FALLTHROUGH;
}
- case MVT::v16i8:
- case MVT::v32i8: {
- assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
- "256-bit byte-blends require AVX2 support!");
+ case MVT::v32i8:
+ assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v16i8: {
+ assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
@@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
+ // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!OptForSize) {
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Masked;
+ }
+
+ // Otherwise load an immediate into a GPR, cast to k-register, and use a
+ // masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
@@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
-static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- bool ImmBlends = false) {
+static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
-static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
@@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
-static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
@@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute(
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
+static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
@@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, VT, V1, V2, Mask, DAG, true))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG, true))
return BlendPerm;
- if (SDValue UnpackPerm =
- lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return UnpackPerm;
- if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
- if (SDValue BlendPerm =
- lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return BlendPerm;
}
@@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
@@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
@@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
return Rotation * Scale;
}
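matchShuffleAsByteRotate converts an element-level rotation into a PALIGNR byte immediate by scaling with the element size. A standalone model of PALIGNR's concatenate-and-extract behaviour showing why that scaling works (illustrative only):

// Illustrative only: PALIGNR takes the 32-byte concatenation (Hi:Lo) and
// returns 16 consecutive bytes starting Imm bytes in, which is why an
// element rotation amount becomes a byte amount by multiplying by the
// element size.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint8_t, 16> Palignr(const std::array<uint8_t, 16> &Hi,
                                       const std::array<uint8_t, 16> &Lo,
                                       unsigned Imm) {
  std::array<uint8_t, 32> Concat{};
  for (int i = 0; i != 16; ++i) {
    Concat[i] = Lo[i];
    Concat[i + 16] = Hi[i];
  }
  std::array<uint8_t, 16> R{};
  for (int i = 0; i != 16; ++i)
    R[i] = (Imm + i < 32) ? Concat[Imm + i] : 0;
  return R;
}

int main() {
  std::array<uint8_t, 16> Lo{}, Hi{};
  for (int i = 0; i != 16; ++i) {
    Lo[i] = static_cast<uint8_t>(i);      // bytes 0..15
    Hi[i] = static_cast<uint8_t>(16 + i); // bytes 16..31
  }
  // Rotating v4i32 by one element is PALIGNR with a 4-byte immediate:
  // the result is bytes 4..19, i.e. <Lo[1], Lo[2], Lo[3], Hi[0]> as dwords.
  auto R = Palignr(Hi, Lo, 4);
  assert(R[0] == 4 && R[15] == 19);
  return 0;
}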
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
- int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
@@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
@@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
DAG.getConstant(Rotation, DL, MVT::i8));
}
+/// Try to lower a vector shuffle as a byte shift sequence.
+static SDValue lowerVectorShuffleAsByteShiftMask(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported");
+
+ // We need a shuffle that has zeros at one/both ends and a sequential
+ // shuffle from one source within.
+ unsigned ZeroLo = Zeroable.countTrailingOnes();
+ unsigned ZeroHi = Zeroable.countLeadingOnes();
+ if (!ZeroLo && !ZeroHi)
+ return SDValue();
+
+ unsigned NumElts = Mask.size();
+ unsigned Len = NumElts - (ZeroLo + ZeroHi);
+ if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+ return SDValue();
+
+ unsigned Scale = VT.getScalarSizeInBits() / 8;
+ ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+ if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+ !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+ return SDValue();
+
+ SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+ Res = DAG.getBitcast(MVT::v16i8, Res);
+
+ // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+ // inner sequential set of elements, possibly offset:
+ // 01234567 --> zzzzzz01 --> 1zzzzzzz
+ // 01234567 --> 4567zzzz --> zzzzz456
+ // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+ if (ZeroLo == 0) {
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+ } else if (ZeroHi == 0) {
+ unsigned Shift = Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else if (!Subtarget.hasSSSE3()) {
+    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+ // by performing 3 byte shifts. Shuffle combining can kick in above that.
+ // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Shift += Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else
+ return SDValue();
+
+ return DAG.getBitcast(VT, Res);
+}
+
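The new lowerVectorShuffleAsByteShiftMask zeroes the ends of a vector with PSLLDQ/PSRLDQ pairs instead of an AND mask. A standalone byte-level model of the second example from the comment above ("01234567 --> 4567zzzz --> zzzzz456"), scaled down to 8 bytes for readability (illustrative only):

// Illustrative only: the byte-shift trick from the comment above.
#include <array>
#include <cassert>

using Vec = std::array<int, 8>;
constexpr int Z = -1; // a zeroed byte, written as 'z' in the comment

static Vec ShiftRightBytes(const Vec &V, int N) { // PSRLDQ-style
  Vec R;
  for (int i = 0; i != 8; ++i)
    R[i] = (i + N < 8) ? V[i + N] : Z;
  return R;
}

static Vec ShiftLeftBytes(const Vec &V, int N) { // PSLLDQ-style
  Vec R;
  for (int i = 0; i != 8; ++i)
    R[i] = (i - N >= 0) ? V[i - N] : Z;
  return R;
}

int main() {
  Vec V = {0, 1, 2, 3, 4, 5, 6, 7};
  Vec Step1 = ShiftRightBytes(V, 4);    // 4567zzzz
  Vec Step2 = ShiftLeftBytes(Step1, 5); // zzzzz456
  assert((Step2 == Vec{Z, Z, Z, Z, Z, 4, 5, 6}));
  return 0;
}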
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
- unsigned ScalarSizeInBits,
- ArrayRef<int> Mask, int MaskOffset,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget) {
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
return -1;
}
-static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
unsigned Opcode;
// Try to match shuffle against V1 shift.
- int ShiftAmt = matchVectorShuffleAsShift(
- ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
- ShiftAmt =
- matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
- Mask, Size, Zeroable, Subtarget);
+ ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable, Subtarget);
V = V2;
}
@@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
-static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx, const APInt &Zeroable) {
+static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
@@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
-static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx) {
+static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
@@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getConstant(BitLen, DL, MVT::i8),
@@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// avoid excess shuffling the offset must either being in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
-static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
@@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
+ // TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
@@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
+ InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
- unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+ unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
@@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
- if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
- !SafeOffset(Offset + 1))
+ if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
@@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
-static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (Offset != 0 && Matches < 2)
return SDValue();
- return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+ return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
+ InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
-static SDValue lowerVectorShuffleAsElementInsertion(
+static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
-static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
- SDValue V0, int BroadcastIdx,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
@@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
+/// Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
+
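// A minimal standalone sketch of the single-SHUFPS test above, using
// std::vector<int> in place of ArrayRef<int>; -1 models an undef element and
// the function name is illustrative only.
#include <cassert>
#include <vector>

// One SHUFPS fills the low two result lanes from one source and the high two
// from one source, so each half of the mask must draw from a single input.
static bool isSingleShufpsMaskSketch(const std::vector<int> &Mask) {
  assert(Mask.size() == 4 && "128-bit SHUFPS masks have four elements");
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false; // low half mixes V1 (0..3) and V2 (4..7) elements
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false; // high half mixes the two sources
  return true;
}

// Usage: {0, 3, 4, 6} keeps the low half in V1 and the high half in V2, so it
// is a single-SHUFPS mask; {0, 4, 1, 5} interleaves sources and is not.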
+/// If we are extracting two 128-bit halves of a vector and shuffling the
+/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
+/// multi-shuffle lowering.
+static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
+ SDValue N1, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ EVT VT = N0.getValueType();
+ assert((VT.is128BitVector() &&
+ (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
+ "VPERM* family of shuffles requires 32-bit or 64-bit elements");
+
+ // Check that both sources are extracts of the same source vector.
+ if (!N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ SDValue WideVec = N0.getOperand(0);
+ EVT WideVT = WideVec.getValueType();
+ if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ !isa<ConstantSDNode>(N1.getOperand(1)))
+ return SDValue();
+
+ // Match extracts of each half of the wide source vector. Commute the shuffle
+ // if the extract of the low half is N1.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
+ const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
+ const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
+ if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
+ return SDValue();
+
+ // Final bailout: if the mask is simple, we are better off using an extract
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+ return SDValue();
+
+ // Extend the shuffle mask with undef elements.
+ NewMask.append(NumElts, -1);
+
+ // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
+ SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
+ NewMask);
+ // This is free: ymm -> xmm.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
+ DAG.getIntPtrConstant(0, DL));
+}
+
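// A standalone sketch of the mask arithmetic used above: given a narrow
// shuffle of the low and high 128-bit halves of a wider vector X, rebuild the
// mask so it can be applied to X directly. Plain std::vector stands in for
// the DAG types; the helper name is illustrative only.
#include <vector>

static std::vector<int>
widenExtractShuffleMaskSketch(unsigned NumElts, unsigned ExtIdx0,
                              unsigned ExtIdx1, std::vector<int> Mask) {
  // Commute operand references if the low-half extract is the second operand.
  if (ExtIdx1 == 0 && ExtIdx0 == NumElts) {
    for (int &M : Mask)
      if (M >= 0)
        M = M < (int)NumElts ? M + (int)NumElts : M - (int)NumElts;
  } else if (ExtIdx0 != 0 || ExtIdx1 != NumElts) {
    return {}; // not extracts of the two halves of X in either order
  }
  // Pad with undef so the mask covers all of X; the shuffled result's low half
  // is then what the original narrow shuffle produced (a free ymm -> xmm).
  Mask.insert(Mask.end(), NumElts, -1);
  return Mask;
}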
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
@@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
@@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
- // Peek through bitcasts as long as BroadcastIdx can be adjusted.
- SDValue VSrc = V.getOperand(0);
- unsigned NumEltBits = V.getScalarValueSizeInBits();
- unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
- if ((NumEltBits % NumSrcBits) == 0)
- BroadcastIdx *= (NumEltBits / NumSrcBits);
- else if ((NumSrcBits % NumEltBits) == 0 &&
- (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
- BroadcastIdx /= (NumSrcBits / NumEltBits);
- else
- break;
- V = VSrc;
+ V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
- int OperandSize =
- V.getOperand(0).getSimpleValueType().getVectorNumElements();
- V = V.getOperand(BroadcastIdx / OperandSize);
- BroadcastIdx %= OperandSize;
+ int OpBitWidth = V.getOperand(0).getValueSizeInBits();
+ int OpIdx = BitOffset / OpBitWidth;
+ V = V.getOperand(OpIdx);
+ BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
@@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (!ConstantIdx)
break;
- int BeginIdx = (int)ConstantIdx->getZExtValue();
- int EndIdx =
- BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
- if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
- BroadcastIdx -= BeginIdx;
+ int EltBitWidth = VOuter.getScalarValueSizeInBits();
+ int Idx = (int)ConstantIdx->getZExtValue();
+ int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
+ int BeginOffset = Idx * EltBitWidth;
+ int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
+ if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
+ BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
@@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
}
break;
}
+ assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
+ BroadcastIdx = BitOffset / NumEltBits;
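// A standalone model of the bit-offset bookkeeping above: the broadcast
// position is tracked in bits so bitcasts need no adjustment, and the
// CONCAT_VECTORS / INSERT_SUBVECTOR steps become simple integer arithmetic.
// The struct and its names are illustrative only.
#include <cassert>

struct BroadcastBitWalkSketch {
  int BitOffset; // broadcast position, in bits, within the current value

  // CONCAT_VECTORS: pick the operand containing the offset and rebase into it.
  int stepConcat(int OpBitWidth) {
    int OpIdx = BitOffset / OpBitWidth;
    BitOffset %= OpBitWidth;
    return OpIdx;
  }

  // INSERT_SUBVECTOR: descend into the inserted value only if the offset lies
  // inside it; otherwise keep walking the outer vector unchanged.
  bool stepInsertSubvector(int BeginOffset, int SubBitWidth) {
    if (BeginOffset <= BitOffset && BitOffset < BeginOffset + SubBitWidth) {
      BitOffset -= BeginOffset;
      return true;
    }
    return false;
  }

  // Recover the element index once the walk stops, for NumEltBits-wide lanes.
  int broadcastIdx(int NumEltBits) const {
    assert(BitOffset % NumEltBits == 0 && "offset must land on a lane boundary");
    return BitOffset / NumEltBits;
  }
};

// Example: element 5 of a v8i32 built by concatenating two 128-bit halves sits
// at bit 160; stepConcat(128) selects operand 1 and leaves 32, i.e. element 1.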
- // Ensure the source vector and BroadcastIdx are for a suitable type.
- if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
- unsigned NumEltBits = VT.getScalarSizeInBits();
- unsigned NumSrcBits = V.getScalarValueSizeInBits();
- if ((NumSrcBits % NumEltBits) == 0)
- BroadcastIdx *= (NumSrcBits / NumEltBits);
- else if ((NumEltBits % NumSrcBits) == 0 &&
- (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
- BroadcastIdx /= (NumEltBits / NumSrcBits);
- else
- return SDValue();
-
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
- V = DAG.getBitcast(SrcVT, V);
- }
+ // Do we need to bitcast the source to retrieve the original broadcast index?
+ bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
- // First, look through bitcast: if the original value has a larger element
- // type than the shuffle, the broadcast element is in essence truncated.
- // Make that explicit to ease folding.
- if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
- if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
- DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ // If the original value has a larger element type than the shuffle, the
+ // broadcast element is in essence truncated. Make that explicit to ease
+ // folding.
+ if (BitCastSrc && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
+ DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
- // Peek through any bitcast (only useful for loads).
- SDValue BC = peekThroughBitcasts(V);
-
// Also check the simpler case, where we can directly reuse the scalar.
- if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
- (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ if (!BitCastSrc &&
+ ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
- LoadSDNode *Ld = cast<LoadSDNode>(BC);
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
@@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
- } else if (BroadcastIdx != 0) {
+ } else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
@@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
- unsigned EltSize = VT.getScalarSizeInBits();
- if (((BroadcastIdx * EltSize) % 128) != 0)
+ if ((BitOffset % 128) != 0)
return SDValue();
- // The shuffle input might have been a bitcast we looked through; look at
- // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
- // later bitcast it to BroadcastVT.
- assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
- V = extract128BitVector(V, BroadcastIdx, DAG, DL);
+ unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
+ V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
@@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
- MVT SrcVT = V.getSimpleValueType();
- if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
- assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
+ assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
- if (SrcVT.isVector()) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ MVT ExtVT;
+ if (V.getValueType().isVector()) {
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
- SrcVT = BroadcastVT.getScalarType();
+ ExtVT = BroadcastVT.getScalarType();
}
- V = DAG.getBitcast(SrcVT, V);
+ V = DAG.getBitcast(ExtVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+ if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
@@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (SrcVT.getSizeInBits() > 128) {
- MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
- 128 / SrcVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128) {
+ MVT ExtVT = V.getSimpleValueType().getScalarType();
+ ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
@@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
- unsigned &InsertPSMask,
- const APInt &Zeroable,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
return false;
}
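// A hedged sketch of how an INSERTPS control byte is assembled, assuming the
// usual immediate layout (bits [7:6] source element of V2, bits [5:4]
// destination slot in V1, bits [3:0] a zero mask for result lanes); the
// helper name is illustrative only.
#include <cassert>
#include <cstdint>

static uint8_t makeInsertPSImmSketch(unsigned SrcElt, unsigned DstElt,
                                     unsigned ZeroMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "out-of-range fields");
  return static_cast<uint8_t>((SrcElt << 6) | (DstElt << 4) | ZeroMask);
}

// Example: inserting V2[2] into lane 1 while forcing lane 3 to zero encodes
// as makeInsertPSImmSketch(2, 1, /*ZeroMask=*/0x8) == 0x98.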
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
- if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
@@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(
+static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
@@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
-static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
@@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
@@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
-static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
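// Two small standalone helpers modelling the v2f64/v2i64 tricks above: the
// SHUFPD immediate simply records which half of each (canonicalized) source
// is used, and XOR-ing a two-element mask entry with 2 retargets it to the
// other operand, which is how the inverted-insertion attempt is formed.
// Names are illustrative only.
#include <cassert>

static unsigned shufpdImmSketch(int M0, int M1) {
  assert(M0 >= 0 && M0 < 2 && M1 >= 2 && M1 < 4 && "expects a V1 then V2 element");
  return static_cast<unsigned>(M0 == 1) |
         (static_cast<unsigned>((M1 - 2) == 1) << 1);
}

static void commuteV2MaskSketch(int Mask[2]) {
  for (int i = 0; i != 2; ++i)
    if (Mask[i] >= 0)
      Mask[i] ^= 2; // 0<->2 and 1<->3: V1 references become V2 references
}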
-/// Test whether this can be lowered with a single SHUFPS instruction.
-///
-/// This is used to disable more specialized lowerings when the shufps lowering
-/// will happen to be efficient.
-static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
- // This routine only handles 128-bit shufps.
- assert(Mask.size() == 4 && "Unsupported mask size!");
- assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
- assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
- assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
- assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
-
- // To lower with a single SHUFPS we need to have the low half and high half
- // each requiring a single input.
- if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
- return false;
- if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
- return false;
-
- return true;
-}
-
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
-static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
@@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
-static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
@@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V =
- lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, MVT::v4f32, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
+ V2, Mask, DAG))
return BlendPerm;
}
@@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
-static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
@@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
}
@@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
-static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
@@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
- int NumLToL =
- std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
- int NumLToH =
- std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
@@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
- int ADWord, BDWord;
+ int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
@@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
@@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
-static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
@@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
-static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
- DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
- MutableMask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
+ Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
@@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
"shuffles.");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG, V1InUse, V2InUse);
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG);
}
/// Check whether a compaction lowering can be done by dropping even
@@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
return 0;
}
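// A deliberately simplified model of the compaction test referenced above:
// does the mask take every Nth element (starting at element 0) for N == 2 or
// N == 4? The real helper also reasons about zeroable elements and the
// single- vs. two-input cases; only the stride idea is sketched here.
#include <vector>

static int compactionStrideSketch(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int N : {2, 4}) {
    bool Matches = true;
    for (int i = 0; i != Size && Matches; ++i)
      if (Mask[i] >= 0 && Mask[i] != i * N)
        Matches = false;
    if (Matches)
      return N;
  }
  return 0; // no even-stride compaction applies
}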
-static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
@@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
-static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use a zext lowering.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
@@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
}
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool V1InUse = false;
bool V2InUse = false;
- SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
@@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
@@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
- if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
@@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
- if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
- return BitBlend;
+ if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
@@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
@@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
@@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
-static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
-static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
@@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
return true;
};
if (DoBothBroadcast())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
-/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
-static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
@@ -13884,7 +14401,6 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
- SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
@@ -13899,10 +14415,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
- LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
+ // Make sure we set all elements of the lane mask, to avoid undef propagation.
+ SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+ for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
+ int SrcLane = SrcLaneMask[DstLane];
+ if (0 <= SrcLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ LaneMask[(DstLane * NumEltsPerLane) + j] =
+ (SrcLane * NumEltsPerLane) + j;
+ }
+ }
+
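// Illustrative sketch, not part of the patch: a standalone rendering of the
// mask bookkeeping added above, assuming a unary 8-element shuffle with two
// 128-bit lanes of four elements (v8f32-style). The helper name
// computeLanePermuteAndPermuteMasks is hypothetical; the real routine works
// on SelectionDAG nodes and bails out (returns SDValue()) on conflicts.
#include <array>
#include <cstdio>

static constexpr int kUndef = -1;

static bool computeLanePermuteAndPermuteMasks(
    const std::array<int, 8> &Mask, std::array<int, 2> &SrcLaneMask,
    std::array<int, 8> &LaneMask, std::array<int, 8> &PermMask) {
  constexpr int NumElts = 8, NumLanes = 2, NumEltsPerLane = NumElts / NumLanes;
  SrcLaneMask.fill(kUndef);
  LaneMask.fill(kUndef);
  PermMask.fill(kUndef);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Each destination lane may draw from only one source lane.
    int SrcLane = M / NumEltsPerLane;
    int DstLane = i / NumEltsPerLane;
    if (SrcLaneMask[DstLane] != kUndef && SrcLaneMask[DstLane] != SrcLane)
      return false;
    SrcLaneMask[DstLane] = SrcLane;
    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
  }

  // Fill every element of the lane mask so no undef lanes leak through,
  // mirroring the fix in the hunk above.
  for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
    int SrcLane = SrcLaneMask[DstLane];
    if (0 <= SrcLane)
      for (int j = 0; j != NumEltsPerLane; ++j)
        LaneMask[(DstLane * NumEltsPerLane) + j] =
            (SrcLane * NumEltsPerLane) + j;
  }
  return true;
}

int main() {
  // <4, 7, 5, 6, 1, 2, 0, 3>: the lanes swap, with a different in-lane permute
  // on each side, so LaneMask becomes <4,5,6,7,0,1,2,3>.
  std::array<int, 8> Mask = {4, 7, 5, 6, 1, 2, 0, 3};
  std::array<int, 2> SrcLaneMask;
  std::array<int, 8> LaneMask, PermMask;
  if (computeLanePermuteAndPermuteMasks(Mask, SrcLaneMask, LaneMask, PermMask))
    std::printf("lane sources: %d %d\n", SrcLaneMask[0], SrcLaneMask[1]);
  return 0;
}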
  // If only the lowest lane is actually shuffled and the rest of the mask is
  // an identity mapping, then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
@@ -13931,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleAsLanePermuteAndBlend(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
@@ -13950,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] / LaneSize)] = true;
if (!LaneUsed[0] || !LaneUsed[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
assert(V2.isUndef() &&
@@ -13981,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
}
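// Illustrative sketch, not part of the patch: the two lane checks used by the
// routine above, written for a unary 8-element mask over 128-bit lanes of four
// elements. bothLanesCross/bothLanesUsed are hypothetical names; whenever
// either test comes back false, the code above splits the shuffle via
// splitAndLowerShuffle().
#include <array>
#include <cstdlib>

// True iff both 128-bit lanes contain elements pulled from the other lane
// (the 32/64-bit element path above).
static bool bothLanesCross(const std::array<int, 8> &Mask) {
  constexpr int Size = 8, LaneSize = 4;
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  return LaneCrossing[0] && LaneCrossing[1];
}

// True iff both source lanes are referenced at all (the byte/word path above);
// assumes a unary mask with indices 0..7.
static bool bothLanesUsed(const std::array<int, 8> &Mask) {
  constexpr int Size = 8, LaneSize = 4;
  bool LaneUsed[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneUsed[Mask[i] / LaneSize] = true;
  return LaneUsed[0] && LaneUsed[1];
}

int main() {
  // <0, 5, 2, 7, 4, 1, 6, 3> pulls elements across both lanes, so neither
  // check allows the cheap split path.
  std::array<int, 8> M = {0, 5, 2, 7, 4, 1, 6, 3};
  return bothLanesCross(M) && bothLanesUsed(M) ? EXIT_SUCCESS : EXIT_FAILURE;
}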
/// Handle lowering 2-lane 128-bit shuffles.
-static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
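// Illustrative sketch, not part of the patch: the architectural imm8 layout
// behind the VPERM2X128 form this routine can fall back on. perm2x128Imm is a
// hypothetical helper; inputs 0/1 pick V1's low/high 128-bit lane, 2/3 pick
// V2's, and -2 requests a zeroed lane (bits 3 and 7 of the immediate).
#include <cstdio>

static int perm2x128Imm(int LoLane, int HiLane) {
  int Imm = 0;
  Imm |= (LoLane == -2) ? 0x08 : (LoLane & 0x3);        // low result lane
  Imm |= (HiLane == -2) ? 0x80 : ((HiLane & 0x3) << 4); // high result lane
  return Imm;
}

int main() {
  // <V2.lo, V1.lo> -> 0x02 ; <zero, V1.hi> -> 0x18.
  std::printf("0x%02x 0x%02x\n", perm2x128Imm(2, 0), perm2x128Imm(-2, 1));
  return 0;
}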
@@ -14012,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -14084,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
-///
-/// FIXME: This should be generalized to 512-bit shuffles.
-static SDValue lowerVectorShuffleByMerging128BitLanes(
+static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
@@ -14095,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return SDValue();
int Size = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
int LaneSize = 128 / VT.getScalarSizeInBits();
- int NumLanes = Size / LaneSize;
- assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
-
SmallVector<int, 16> RepeatMask(LaneSize, -1);
- int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
+ SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
@@ -14111,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
- // Determine which of the 4 possible input lanes (2 from each source)
+ // Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out of
      // sources we can't do anything.
@@ -14250,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
-/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
-/// This allows for fast cases such as subvector extraction/insertion
-/// or shuffling smaller vector types which can lower more efficiently.
-static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert((VT.is256BitVector() || VT.is512BitVector()) &&
- "Expected 256-bit or 512-bit vector");
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
-
- bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
- bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
- if (!UndefLower && !UndefUpper)
- return SDValue();
-
- // Upper half is undef and lower half is whole upper subvector.
- // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (UndefUpper &&
- isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Lower half is undef and upper half is whole lower subvector.
- // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (UndefLower &&
- isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- }
+/// If the input shuffle mask results in a vector that is undefined in all of
+/// its upper or all of its lower half elements, and that mask accesses only 2
+/// halves of the shuffle's operands, return true. A mask of half the width,
+/// with the indexes adjusted to access the extracted halves of the original
+/// shuffle operands, is returned in HalfMask. HalfIdx1 and HalfIdx2 identify
+/// which half (lower or upper) of each input operand is accessed.
+static bool
+getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
+ int &HalfIdx1, int &HalfIdx2) {
+ assert((Mask.size() == HalfMask.size() * 2) &&
+ "Expected input mask to be twice as long as output");
+
+ // Exactly one half of the result must be undef to allow narrowing.
+ bool UndefLower = isUndefLowerHalf(Mask);
+ bool UndefUpper = isUndefUpperHalf(Mask);
+ if (UndefLower == UndefUpper)
+ return false;
- // If the shuffle only uses two of the four halves of the input operands,
- // then extract them and perform the 'half' shuffle at half width.
- // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
- int HalfIdx1 = -1, HalfIdx2 = -1;
- SmallVector<int, 8> HalfMask(HalfNumElts);
- unsigned Offset = UndefLower ? HalfNumElts : 0;
+ unsigned HalfNumElts = HalfMask.size();
+ unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
+ HalfIdx1 = -1;
+ HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
- int M = Mask[i + Offset];
+ int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
@@ -14324,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
}
// Too many half vectors referenced.
- return SDValue();
+ return false;
}
- assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
-
- // Only shuffle the halves of the inputs when useful.
- int NumLowerHalves =
- (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
- int NumUpperHalves =
- (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
-
- // uuuuXXXX - don't extract uppers just to insert again.
- if (UndefLower && NumUpperHalves != 0)
- return SDValue();
- // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
- if (UndefUpper && NumUpperHalves == 2)
- return SDValue();
+ return true;
+}
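// Illustrative sketch, not part of the patch: a standalone version of the
// half-mask extraction above for an 8-element result (HalfNumElts == 4), using
// plain ints instead of ArrayRef/MutableArrayRef and treating any negative
// mask entry as undef. Half indices follow the caller's convention:
// 0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi.
#include <array>
#include <cstdio>

static bool getHalfShuffleMaskSketch(const std::array<int, 8> &Mask,
                                     std::array<int, 4> &HalfMask,
                                     int &HalfIdx1, int &HalfIdx2,
                                     bool &UndefLower) {
  constexpr int HalfNumElts = 4;
  auto isUndefHalf = [&](int Base) {
    for (int i = 0; i != HalfNumElts; ++i)
      if (Mask[Base + i] >= 0)
        return false;
    return true;
  };
  UndefLower = isUndefHalf(0);
  bool UndefUpper = isUndefHalf(HalfNumElts);
  if (UndefLower == UndefUpper) // exactly one result half must be undef
    return false;

  int Offset = UndefLower ? HalfNumElts : 0;
  HalfIdx1 = HalfIdx2 = -1;
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }
    int HalfIdx = M / HalfNumElts; // which of the four input halves
    int HalfElt = M % HalfNumElts; // element index inside that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }
    return false; // too many half vectors referenced
  }
  return true;
}

int main() {
  // <9, 2, 11, 0, u, u, u, u> mixes V2.lo (9, 11) with V1.lo (2, 0); prints
  // "halves 2,0 mask <1,6,3,4>".
  std::array<int, 8> Mask = {9, 2, 11, 0, -1, -1, -1, -1};
  std::array<int, 4> HalfMask;
  int HalfIdx1, HalfIdx2;
  bool UndefLower;
  if (getHalfShuffleMaskSketch(Mask, HalfMask, HalfIdx1, HalfIdx2, UndefLower))
    std::printf("halves %d,%d mask <%d,%d,%d,%d>\n", HalfIdx1, HalfIdx2,
                HalfMask[0], HalfMask[1], HalfMask[2], HalfMask[3]);
  return 0;
}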
- // AVX2 - XXXXuuuu - always extract lowers.
- if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
- // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
- if (VT == MVT::v4f64 || VT == MVT::v4i64)
- return SDValue();
- // AVX2 supports variable 32-bit element cross-lane shuffles.
- if (VT == MVT::v8f32 || VT == MVT::v8i32) {
- // XXXXuuuu - don't extract lowers and uppers.
- if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
- return SDValue();
- }
- }
+/// Given the output values from getHalfShuffleMask(), create a half width
+/// shuffle of extracted vectors followed by an insert back to full width.
+static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> HalfMask, int HalfIdx1,
+ int HalfIdx2, bool UndefLower,
+ SelectionDAG &DAG) {
+ assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
+ assert(V1.getValueType().isSimple() && "Expecting only simple types");
- // AVX512 - XXXXuuuu - always extract lowers.
- if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
- return SDValue();
+ MVT VT = V1.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
- auto GetHalfVector = [&](int HalfIdx) {
+ auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
@@ -14368,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
DAG.getIntPtrConstant(HalfIdx, DL));
};
- SDValue Half1 = GetHalfVector(HalfIdx1);
- SDValue Half2 = GetHalfVector(HalfIdx2);
+ // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
+ SDValue Half1 = getHalfVector(HalfIdx1);
+ SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
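// Illustrative sketch, not part of the patch: the operand/offset arithmetic
// performed by getShuffleHalfVectors() above, for an 8-element full type
// (HalfNumElts == 4). HalfRef and resolveHalf are hypothetical names; the
// EXTRACT_SUBVECTOR / narrow shuffle / INSERT_SUBVECTOR DAG nodes are elided.
#include <cstdio>

struct HalfRef {
  int Operand;      // 0 -> V1, 1 -> V2, -1 -> undef half
  int ExtractIndex; // element index at which the subvector extract starts
};

static HalfRef resolveHalf(int HalfIdx) {
  constexpr int HalfNumElts = 4;
  if (HalfIdx < 0)
    return {-1, 0};
  return {HalfIdx < 2 ? 0 : 1, (HalfIdx % 2) * HalfNumElts};
}

int main() {
  constexpr int HalfNumElts = 4;
  bool UndefLower = false; // the result's upper half is the undef one
  int InsertIndex = UndefLower ? HalfNumElts : 0;
  HalfRef A = resolveHalf(2); // V2's low half
  HalfRef B = resolveHalf(0); // V1's low half
  std::printf("extract op%d@%d and op%d@%d, insert at %d\n", A.Operand,
              A.ExtractIndex, B.Operand, B.ExtractIndex, InsertIndex);
  return 0;
}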
+/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected 256-bit or 512-bit vector");
+
+ bool UndefLower = isUndefLowerHalf(Mask);
+ if (!UndefLower && !isUndefUpperHalf(Mask))
+ return SDValue();
+
+ assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
+ "Completely undef shuffle mask should have been simplified already");
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ if (!UndefLower &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
+ return SDValue();
+
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Only shuffle the halves of the inputs when useful.
+ unsigned NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ unsigned NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+ assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
+
+ // Determine the larger pattern of undef/halves, then decide if it's worth
+ // splitting the shuffle based on subtarget capabilities and types.
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ if (!UndefLower) {
+ // XXXXuuuu: no insert is needed.
+ // Always extract lowers when setting lower - these are all free subreg ops.
+ if (NumUpperHalves == 0)
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+
+ if (NumUpperHalves == 1) {
+ // AVX2 has efficient 32/64-bit element cross-lane shuffles.
+ if (Subtarget.hasAVX2()) {
+        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
+ return SDValue();
+ // If this is a unary shuffle (assume that the 2nd operand is
+ // canonicalized to undef), then we can use vpermpd. Otherwise, we
+ // are better off extracting the upper half of 1 operand and using a
+ // narrow shuffle.
+ if (EltWidth == 64 && V2.isUndef())
+ return SDValue();
+ }
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Extract + narrow shuffle is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // Don't extract both uppers, instead shuffle and then extract.
+ assert(NumUpperHalves == 2 && "Half vector count went wrong");
+ return SDValue();
+ }
+
+ // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
+ if (NumUpperHalves == 0) {
+ // AVX2 has efficient 64-bit element cross-lane shuffles.
+ // TODO: Refine to account for unary shuffle, splat, and other masks?
+ if (Subtarget.hasAVX2() && EltWidth == 64)
+ return SDValue();
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Narrow shuffle + insert is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
+ return SDValue();
+}
+
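// Illustrative sketch, not part of the patch: the half-usage counting that the
// heuristics above branch on (half indices 0/2 are lower halves, 1/3 upper
// halves). countHalves is a hypothetical helper; the subtarget-specific
// decisions that follow the counts are omitted.
#include <cstdio>

static void countHalves(int HalfIdx1, int HalfIdx2, unsigned &NumLower,
                        unsigned &NumUpper) {
  NumLower = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  NumUpper = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
}

int main() {
  unsigned Lo, Hi;
  countHalves(/*HalfIdx1=*/1, /*HalfIdx2=*/2, Lo, Hi); // V1.hi and V2.lo
  std::printf("lower halves: %u, upper halves: %u\n", Lo, Hi); // 1, 1
  return 0;
}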
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
@@ -14560,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
SubLaneMask);
}
-static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &ShuffleImm,
- ArrayRef<int> Mask) {
+static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &ShuffleImm, ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
@@ -14597,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
return false;
}
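// Illustrative sketch, not part of the patch: the v4f64 case of the SHUFPD
// matching above. Result element i must come from V1 when i is even and from
// V2 when i is odd, stay within its own 128-bit lane, and its low/high choice
// becomes bit i of the immediate. matchShufpdV4F64 is a hypothetical helper;
// the commuted-operand handling of the real matcher is omitted.
#include <array>
#include <cstdio>

static bool matchShufpdV4F64(const std::array<int, 4> &Mask, unsigned &Imm) {
  constexpr int NumElts = 4;
  Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // an undef element imposes no constraint
    // 128-bit lane base within the required source (V2 elements start at 4).
    int Val = (i & 6) + NumElts * (i & 1);
    if (M < Val || M > Val + 1)
      return false;
    Imm |= (M % 2) << i;
  }
  return true;
}

int main() {
  // <1, 5, 2, 7> selects V1[1], V2[1], V1[2], V2[3] -> immediate 0b1011.
  std::array<int, 4> Mask = {1, 5, 2, 7};
  unsigned Imm;
  if (matchShufpdV4F64(Mask, Imm))
    std::printf("shufpd imm: 0x%x\n", Imm); // prints 0xb
  return 0;
}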
-static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
- if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
@@ -14615,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
@@ -14659,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
- DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
+ Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
+ Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14694,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
- return Result;
+ return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
  // If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
-static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
@@ -14763,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14800,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
@@ -14849,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
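// Illustrative sketch, not part of the patch: the "mask repeats in every
// 128-bit lane" test that gates the repeated-mask (SHUFPS-style) paths above,
// written for an 8-element type with two lanes of four (v8f32/v8i32).
// is128BitLaneRepeated is a hypothetical helper; zeroable-element handling is
// omitted and any negative entry is treated as undef.
#include <array>
#include <cstdio>

static bool is128BitLaneRepeated(const std::array<int, 8> &Mask,
                                 std::array<int, 4> &RepeatedMask) {
  constexpr int NumElts = 8, LaneSize = 4;
  RepeatedMask.fill(-1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // The element must come from the matching lane of its source operand.
    if ((M % NumElts) / LaneSize != i / LaneSize)
      return false;
    // Reduce to a per-lane index, keeping V1 picks in 0..3 and V2 picks in 4..7.
    int LocalM = M < NumElts ? M % LaneSize : M % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      return false; // the lanes disagree, so the mask is not repeated
  }
  return true;
}

int main() {
  // <0, 8, 1, 9, 4, 12, 5, 13> is the UNPCKLPS pattern repeated in both lanes;
  // the per-lane mask comes out as <0, 4, 1, 5>.
  std::array<int, 8> Mask = {0, 8, 1, 9, 4, 12, 5, 13};
  std::array<int, 4> Repeated;
  if (is128BitLaneRepeated(Mask, Repeated))
    std::printf("repeated: <%d,%d,%d,%d>\n", Repeated[0], Repeated[1],
                Repeated[2], Repeated[3]);
  return 0;
}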
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -14875,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
  // For non-AVX512, if the mask is of 16-bit elements within each lane then try
  // to split, since after the split we get more efficient code using
  // vpunpcklwd and vpunpckhwd than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
-static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -14926,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
  // For non-AVX512, if the mask is of 16-bit elements within each lane then try to split
@@ -14935,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
@@ -14961,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15006,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
-static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15039,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15082,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
@@ -15095,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
-static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15141,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15183,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
+ Subtarget);
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
@@ -15220,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
@@ -15251,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ElementBits < 32) {
    // No floating point type is available, so if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
- if (SDValue V =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -15268,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -15286,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
-static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -15388,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15419,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
- Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15471,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
- return Unpck;
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have a single input shuffle with different shuffle patterns in the
@@ -15492,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15530,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
- V1, V2, Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15578,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
@@ -15595,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
@@ -15621,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
-static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15650,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
@@ -15675,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
@@ -15705,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -15743,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (!V2.isUndef())
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -15756,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
@@ -15770,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
@@ -15790,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
@@ -15809,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
-// version of matchVectorShuffleAsShift.
+// version of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
@@ -15844,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
-static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
@@ -16037,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
- if (V2IsUndef)
- for (int M : Mask)
- if (M >= NumElements) {
- SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
- for (int &M : NewMask)
- if (M >= NumElements)
- M = -1;
- return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
- }
+ if (V2IsUndef &&
+ any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ }
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2);
(void)MaskUpperLimit;
@@ -16083,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
@@ -16122,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
- if (SDValue V =
- lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
@@ -16401,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- DAG.getConstant(IdxVal, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
@@ -16527,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (!isa<ConstantSDNode>(N2))
+
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+ if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
- auto *N2C = cast<ConstantSDNode>(N2);
- unsigned IdxVal = N2C->getZExtValue();
+ uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
@@ -16575,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(IdxIn128, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+ // This will be just movd/movq/movss/movsd.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
+ (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64)) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
@@ -16613,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
@@ -16663,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
+ "Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
@@ -16789,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue
-X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
- const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
-
- // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
- // global base reg.
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
-
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
-
- SDLoc DL(Op);
- Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
-
- // With PIC, the address is actually $g + Offset.
- if (OpFlag) {
- Result =
- DAG.getNode(ISD::ADD, DL, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
- }
-
- // For symbols that require a load from a stub to get the address, emit the
- // load.
- if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
-
- return Result;
+SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
@@ -16841,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
- const SDLoc &dl, int64_t Offset,
- SelectionDAG &DAG) const {
- // Create the TargetGlobalAddress node, folding in the constant
- // offset if it is legal.
- unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+/// Creates target global address or external symbol nodes for calls or
+/// other uses.
+SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const {
+ // Unpack the global address or external symbol.
+ const SDLoc &dl = SDLoc(Op);
+ const GlobalValue *GV = nullptr;
+ int64_t Offset = 0;
+ const char *ExternalSym = nullptr;
+ if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
+ GV = G->getGlobal();
+ Offset = G->getOffset();
+ } else {
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ ExternalSym = ES->getSymbol();
+ }
+
+ // Calculate some flags for address lowering.
+ const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlags;
+ if (ForCall)
+ OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
+ else
+ OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
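+ // Track whether the address must be adjusted by the PIC base register and/or
+ // loaded from the GOT/stub.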
+ bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
+ bool NeedsLoad = isGlobalStubReference(OpFlags);
+
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
- if (OpFlags == X86II::MO_NO_FLAG &&
- X86::isOffsetSuitableForCodeModel(Offset, M)) {
- // A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
- Offset = 0;
+
+ if (GV) {
+ // Create a target global address if this is a global. If possible, fold the
+ // offset into the global address reference. Otherwise, ADD it on later.
+ int64_t GlobalOffset = 0;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ std::swap(GlobalOffset, Offset);
+ }
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+ // If this is not a global address, this must be an external symbol.
+ Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
+ // If this is a direct call, avoid the wrapper if we don't need to do any
+ // loads or adds. This allows SDAG ISel to match direct calls.
+ if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
+ return Result;
+
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
- if (isGlobalRelativeToPICBase(OpFlags)) {
+ if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
- if (isGlobalStubReference(OpFlags))
+ if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
@@ -16884,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
static SDValue
@@ -17112,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium() ||
- Subtarget.isTargetWindowsGNU()) {
+ if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -17254,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
APInt APIntShiftAmt;
if (isConstantSplat(Amt, APIntShiftAmt)) {
- uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
+ uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
}
@@ -17267,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -17311,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+ const X86Subtarget &Subtarget) {
+ switch (Opcode) {
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
+
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+ return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
+ }
+}
+
+/// Given a scalar cast operation that is extracted from a vector, try to
+/// vectorize the cast op followed by extraction. This will avoid an expensive
+/// round-trip between XMM and GPR.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: This could be enhanced to handle smaller integer types by peeking
+ // through an extend.
+ SDValue Extract = Cast.getOperand(0);
+ MVT DestVT = Cast.getSimpleValueType();
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Extract.getOperand(1)))
+ return SDValue();
+
+ // See if we have a 128-bit vector cast op for this type of cast.
+ SDValue VecOp = Extract.getOperand(0);
+ MVT FromVT = VecOp.getSimpleValueType();
+ unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
+ MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
+ MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
+ if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
+ return SDValue();
+
+ // If we are extracting from a non-zero element, first shuffle the source
+ // vector to allow extracting from element zero.
+ SDLoc DL(Cast);
+ if (!isNullConstant(Extract.getOperand(1))) {
+ SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
+ Mask[0] = Extract.getConstantOperandVal(1);
+ VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
+ }
+ // If the source vector is wider than 128-bits, extract the low part. Do not
+ // create an unnecessarily wide vector cast op.
+ if (FromVT != Vec128VT)
+ VecOp = extract128BitVector(VecOp, 0, DAG, DL);
+
+ // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
+ // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
+ SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
+ DAG.getIntPtrConstant(0, DL));
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
@@ -17318,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
@@ -17371,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits()/8;
+ unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *MMO;
+ MachineMemOperand *LoadMMO;
if (FI) {
int SSFI = FI->getIndex();
- MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
- MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
- SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
- SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
- X86ISD::FILD, DL,
- Tys, Ops, SrcVT, MMO);
+ SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue Result =
+ DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
+ Tys, FILDOps, SrcVT, LoadMMO);
if (useSSE) {
Chain = Result.getValue(1);
@@ -17397,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits()/8;
+ unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = {
- Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
- };
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, Op.getValueType(), MMO);
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
+ Op.getValueType(), StoreMMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
@@ -17545,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
// Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+ SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
// Clear upper part of LO, lower HI.
SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
@@ -17680,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
@@ -17732,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
@@ -17768,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
-// just return an <SDValue(), SDValue()> pair.
+// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
-// to i16, i32 or i64, and we lower it to a legal sequence.
-// If lowered to the final integer result we return a <result, SDValue()> pair.
-// Otherwise we lower it to a sequence ending with a FIST, return a
-// <FIST, StackSlot> pair, and the caller is responsible for loading
-// the final integer result from StackSlot.
-std::pair<SDValue,SDValue>
+// to i16, i32 or i64, and we lower it to a legal sequence and return the
+// result.
+SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned, bool IsReplace) const {
+ bool IsSigned) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
@@ -17787,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
- return std::make_pair(SDValue(), SDValue());
+ return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
- bool UnsignedFixup = !IsSigned &&
- DstTy == MVT::i64 &&
- (!Subtarget.is64Bit() ||
- !isScalarFPTypeInSSEReg(TheVT));
+ bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
- if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
+ if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -17809,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
- // These are really Legal.
- if (DstTy == MVT::i32 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
- if (Subtarget.is64Bit() &&
- DstTy == MVT::i64 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
-
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned MemSize = DstTy.getSizeInBits()/8;
+ unsigned MemSize = DstTy.getStoreSize();
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- unsigned Opc;
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
-
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
@@ -17874,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
- Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0x80000000, DL, MVT::i32));
+ Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
+ DAG.getConstant(0, DL, MVT::i64),
+ DAG.getConstant(APInt::getSignMask(64),
+ DL, MVT::i64));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
@@ -17884,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
- Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(MF, SSFI));
- SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
- SDValue Ops[] = {
- Chain, StackSlot, DAG.getValueType(TheVT)
- };
-
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
+ SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDValue Ops[] = { Chain, StackSlot };
+
+ unsigned FLDSize = TheVT.getStoreSize();
+ assert(FLDSize <= MemSize && "Stack slot not big enough");
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
- SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, MemSize, MemSize);
-
- if (UnsignedFixup) {
-
- // Insert the FIST, load its result as two i32's,
- // and XOR the high i32 with Adjust.
+ // Build the FP_TO_INT*_IN_MEM
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, DstTy, MMO);
- SDValue FistOps[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- FistOps, DstTy, MMO);
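+ // Load the integer result back out of the stack slot.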
+ SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
- SDValue Low32 =
- DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
- SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
+ // If we need an unsigned fixup, XOR the result with adjust.
+ if (UnsignedFixup)
+ Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
- SDValue High32 =
- DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
- High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
-
- if (Subtarget.is64Bit()) {
- // Join High32 and Low32 into a 64-bit result.
- // (High32 << 32) | Low32
- Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
- High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
- High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
- DAG.getConstant(32, DL, MVT::i8));
- SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
- return std::make_pair(Result, SDValue());
- }
-
- SDValue ResultOps[] = { Low32, High32 };
-
- SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
- : DAG.getMergeValues(ResultOps, DL);
- return std::make_pair(pair, SDValue());
- } else {
- // Build the FP_TO_INT*_IN_MEM
- SDValue Ops[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, DstTy, MMO);
- return std::make_pair(FIST, StackSlot);
- }
+ return Res;
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- MVT VT = Op->getSimpleValueType(0);
- SDValue In = Op->getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
+ "Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
@@ -17970,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
+ unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
+
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
@@ -17977,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
- return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
+ return DAG.getNode(ExtendInVecOpc, dl, VT, In);
}
if (Subtarget.hasInt256())
@@ -18000,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
+ SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
+
+ // Short-circuit if we can determine that each 128-bit half is the same value.
+ // Otherwise, this is difficult to match and optimize.
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
+ if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
- bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
@@ -18179,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
- Res = DAG.getBitcast(MVT::v4i64, Res);
- Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
+ // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
+ SmallVector<int, 64> Mask;
+ int Scale = 64 / OutVT.getScalarSizeInBits();
+ scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
@@ -18422,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
MVT VT = Op.getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ SDLoc dl(Op);
if (VT.isVector()) {
- SDValue Src = Op.getOperand(0);
- SDLoc dl(Op);
-
- if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
+ if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -18447,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+ if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
@@ -18458,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(!VT.isVector());
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
- IsSigned, /*IsReplace=*/ false);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode())
+ bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
+
+ if (!IsSigned && Subtarget.hasAVX512()) {
+ // Conversions from f32/f64 should be legal.
+ if (UseSSEReg)
+ return Op;
+
+ // Use default expansion.
+ if (VT == MVT::i64)
+ return SDValue();
+ }
+
+ // Promote i16 to i32 if we can use an SSE operation.
+ if (VT == MVT::i16 && UseSSEReg) {
+ assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ // If this is an FP_TO_SINT using SSEReg we're done.
+ if (UseSSEReg && IsSigned)
return Op;
- if (StackSlot.getNode())
- // Load the result.
- return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
+ // Fall back to X87.
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ return V;
- // The node is the result.
- return FIST;
+ llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
@@ -18491,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18513,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
- // Defer forming the minimal horizontal op if the vector source has more than
- // the 2 extract element uses that we're matching here. In that case, we might
- // form a horizontal op that includes more than 1 add/sub op.
+ // Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
- !LHS.getOperand(0)->hasNUsesOfValue(2, 0))
- return Op;
-
- if (!isa<ConstantSDNode>(LHS.getOperand(1)) ||
+ !isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
@@ -18540,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
- if (LExtIndex == 1 && RExtIndex == 0 &&
+ if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
- // TODO: This can be extended to handle other adjacent extract pairs.
- if (LExtIndex != 0 || RExtIndex != 1)
+ if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
+ unsigned NumLanes = BitWidth / 128;
+ unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
- // equivalent, so extract the 256/512-bit source op to 128-bit.
- // This is free: ymm/zmm -> xmm.
+ // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -18732,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
-
- if (!Subtarget.hasSSE41())
- return SDValue();
-
- if (!Op->hasOneUse())
- return SDValue();
-
- SDNode *N = Op.getNode();
- SDLoc DL(N);
-
+/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
+/// style scalarized (associative) reduction patterns.
+static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
- DenseMap<SDValue, unsigned> VecInMap;
- SmallVector<SDValue, 8> VecIns;
+ DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
- Opnds.push_back(N->getOperand(0));
- Opnds.push_back(N->getOperand(1));
+ assert(Op.getOpcode() == unsigned(BinOp) &&
+ "Unexpected bit reduction opcode");
+ Opnds.push_back(Op.getOperand(0));
+ Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
- // BFS traverse all OR'd operands.
- if (I->getOpcode() == ISD::OR) {
+ // BFS traverse all BinOp operands.
+ if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
@@ -18771,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
// Quit if a non-EXTRACT_VECTOR_ELT
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
+ return false;
// Quit if without a constant index.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ return false;
- SDValue ExtractedFromVec = I->getOperand(0);
- DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
- if (M == VecInMap.end()) {
- VT = ExtractedFromVec.getValueType();
- // Quit if not 128/256-bit vector.
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SDValue Src = I->getOperand(0);
+ DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
+ if (M == SrcOpMap.end()) {
+ VT = Src.getValueType();
// Quit if not the same type.
- if (VecInMap.begin() != VecInMap.end() &&
- VT != VecInMap.begin()->first.getValueType())
- return SDValue();
- M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
- VecIns.push_back(ExtractedFromVec);
+ if (SrcOpMap.begin() != SrcOpMap.end() &&
+ VT != SrcOpMap.begin()->first.getValueType())
+ return false;
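+ // Allocate one bit per element so we can check that each element of this
+ // source is extracted exactly once.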
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt EltCount = APInt::getNullValue(NumElts);
+ M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
+ SrcOps.push_back(Src);
}
- M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ // Quit if element already used.
+ unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (M->second[CIdx])
+ return false;
+ M->second.setBit(CIdx);
}
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Not extracted from 128-/256-bit vector.");
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
- unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ return true;
+}
- for (DenseMap<SDValue, unsigned>::const_iterator
- I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
- // Quit if not all elements are used.
- if (I->second != FullMask)
- return SDValue();
- }
+// Check whether an OR'd tree is PTEST-able.
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, SDValue &X86CC) {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ return SDValue();
+
+ SmallVector<SDValue, 8> VecIns;
+ if (!matchBitOpReduction(Op, ISD::OR, VecIns))
+ return SDValue();
+ // Quit if not 128/256-bit vector.
+ EVT VT = VecIns[0].getValueType();
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+
+ SDLoc DL(Op);
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
@@ -18822,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
+ MVT::i8);
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// return true if \c Op has a use that doesn't just read flags.
@@ -18963,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
- if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
- Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Only promote the compare up to I32 if it is a 16 bit operation
- // with an immediate. 16 bit immediates are to be avoided.
- if (Op0.getValueType() == MVT::i16 &&
- ((isa<ConstantSDNode>(Op0) &&
- !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) ||
- (isa<ConstantSDNode>(Op1) &&
- !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) &&
- !DAG.getMachineFunction().getFunction().optForMinSize() &&
- !Subtarget.isAtom()) {
+ EVT CmpVT = Op0.getValueType();
+
+ if (CmpVT.isFloatingPoint())
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+
+ assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
+ CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+
+ // Only promote the compare up to I32 if it is a 16 bit operation
+ // with an immediate. 16 bit immediates are to be avoided.
+ if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+ !DAG.getMachineFunction().getFunction().hasMinSize()) {
+ ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
+ ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+ // Don't do this if the immediate can fit in 8 bits.
+ if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
+ (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
- Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
+ // For equality comparisons try to use SIGN_EXTEND if the input was
+ // truncated from something with enough sign bits.
+ if (Op0.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op0.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ } else if (Op1.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op1.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ }
+ }
+
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
- // Use SUB instead of CMP to enable CSE between SUB and CMP.
- SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return SDValue(Sub.getNode(), 1);
}
- assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!");
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
+ return Sub.getValue(1);
}
/// Convert a comparison if required by the subtarget.
@@ -19146,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -19290,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
-/// Given a simple buildvector constant, return a new vector constant with each
-/// element decremented. If decrementing would result in underflow or this
-/// is not a simple vector constant, return an empty value.
-static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
@@ -19308,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
- // Avoid underflow.
- if (Elt->getAPIntValue().isNullValue())
+ // Avoid overflow/underflow.
+ const APInt &EltC = Elt->getAPIntValue();
+ if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
- NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
+ NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
@@ -19344,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
- SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
+ case ISD::SETUGT: {
+ // If the comparison is against a constant, we can turn this into a setuge.
+ // This is beneficial because materializing a constant 0 for the PCMPEQ is
+ // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+ // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
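+ // For example, a v16i8 'X >u 5' becomes 'usubsat(6, X) == 0': the saturating
+ // subtraction is zero exactly when X >= 6, i.e. when X > 5.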
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+ if (!UGEOp1)
+ return SDValue();
+ Op1 = Op0;
+ Op0 = UGEOp1;
+ break;
+ }
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
@@ -19446,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
- // Break 256-bit integer vector compare into smaller ones.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
-
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
@@ -19503,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
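+ // For example, with v4i32 and C == 4 (bit 2): ShiftAmt = 32 - 2 - 1 = 29,
+ // so (X << 29) >>s 31 is all-ones exactly when bit 2 of X is set, i.e. when
+ // (X & 4) == 4.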
+ if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
+ Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
+ ConstantSDNode *C1 = isConstOrConstSplat(Op1);
+ if (C1 && C1->getAPIntValue().isPowerOf2()) {
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
+
+ SDValue Result = Op0.getOperand(0);
+ Result = DAG.getNode(ISD::SHL, dl, VT, Result,
+ DAG.getConstant(ShiftAmt, dl, VT));
+ Result = DAG.getNode(ISD::SRA, dl, VT, Result,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ return Result;
+ }
+ }
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
@@ -19530,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
- // TODO: This could be extended to handle a non-splat constant by checking
- // that each element of the constant is not the max/null value.
- APInt C;
- if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
+ if (Cond == ISD::SETUGT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isMaxValue();
+ })) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
- Op1 = DAG.getConstant(C + 1, dl, VT);
+ Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETUGE;
}
- if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
+ if (Cond == ISD::SETULT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isNullValue();
+ })) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
- Op1 = DAG.getConstant(C - 1, dl, VT);
+ Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETULE;
}
bool Invert = false;
@@ -19826,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
- Cond = X86::COND_B;
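+ // X + 1 overflows (unsigned) exactly when the result wraps to zero, so
+ // checking ZF is equivalent to checking CF here; this in turn permits using
+ // INC, which does not update CF.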
+ Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
@@ -19867,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
+ assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
@@ -20036,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
+ SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
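+ // This compare computes 0 - CmpOp0, which sets the carry flag exactly when
+ // CmpOp0 != 0, so the SBB of zero with itself below yields all-ones or zero
+ // without a branch.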
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
@@ -20111,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
@@ -20120,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
- Opc == X86ISD::BT) { // FIXME
+ Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
@@ -20193,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Promote i16 cmovs if it won't prevent folding a load.
- if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+ // Or finally, promote i8 cmovs if we have CMOV,
+ // or i16 cmovs if it won't prevent folding a load.
+ // FIXME: we should not limit promotion of the i8 case to only when the CMOV
+ // is legal, but EmitLoweredSelect() cannot deal with these extensions being
+ // inserted between two CMOVs (this applies to the i16 case as well).
+ // https://bugs.llvm.org/show_bug.cgi?id=40974
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+ (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
+ !MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -20453,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+/// Change a vector store into a pair of half-size vector stores.
+static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert((StoredVal.getValueType().is256BitVector() ||
+ StoredVal.getValueType().is512BitVector()) &&
+ "Expecting 256/512-bit op");
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
+ unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
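+ // HalfAlign is in bytes and doubles as the offset of the upper half:
+ // 16 bytes for a 256-bit store, 32 bytes for a 512-bit store.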
+
+ SDLoc DL(Store);
+ SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
+ SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Ptr0 = Store->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
+ unsigned Alignment = Store->getAlignment();
+ SDValue Ch0 =
+ DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
+ Alignment, Store->getMemOperand()->getFlags());
+ SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
+ Store->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Store->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
+}
+
+/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+ SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert(StoreVT.is128BitVector() &&
+ StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+ StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreSVT = StoreVT.getScalarType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned ScalarSize = StoreSVT.getStoreSize();
+ unsigned Alignment = Store->getAlignment();
+
+ SDLoc DL(Store);
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Offset = i * ScalarSize;
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+ DAG.getIntPtrConstant(i, DL));
+ SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+ Store->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Alignment, Offset),
+ Store->getMemOperand()->getFlags());
+ Stores.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -20482,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
if (St->isTruncatingStore())
return SDValue();
+ // If this is a 256-bit store of concatenated ops, we are better off splitting
+ // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
+ // and each half can execute independently. Some cores would split the op into
+ // halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT.is256BitVector()) {
+ SmallVector<SDValue, 4> CatOps;
+ if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+ return splitVectorStore(St, DAG);
+ return SDValue();
+ }
+
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
TargetLowering::TypeWidenVector)
return SDValue();
- // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
- // and store it.
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
- MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
- MVT CastVT = MVT::getVectorVT(StVT, 2);
- StoredVal = DAG.getBitcast(CastVT, StoredVal);
- StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
+ if (Subtarget.hasSSE2()) {
+ // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+ // and store it.
+ MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+ MVT CastVT = MVT::getVectorVT(StVT, 2);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal);
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+ St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -20694,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8) {
- SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
@@ -21240,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
- ConstantSDNode *ND;
- switch(Opc) {
+ switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
@@ -21443,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND ||
+ Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
@@ -21517,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
- if (!isa<ConstantSDNode>(Rnd))
- return false;
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+
+ return false;
+ };
+ auto isRoundModeSAE = [](SDValue Rnd) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ return false;
+ };
+ auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
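+ // For example, a rounding operand of (NO_EXC | TO_ZERO) is accepted here
+ // and RC is reduced to TO_ZERO; any operand without NO_EXC set is rejected.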
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
+ RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
+ RC == X86::STATIC_ROUNDING::TO_POS_INF ||
+ RC == X86::STATIC_ROUNDING::TO_ZERO;
+ }
+ }
+
+ return false;
};
SDLoc dl(Op);
@@ -21537,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Rnd);
- }
+ Op.getOperand(1),
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
}
+ case INTR_TYPE_1OP_SAE: {
+ SDValue Sae = Op.getOperand(2);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
+ }
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
@@ -21553,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Src2, Rnd);
- }
+ Op.getOperand(1), Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
+ case INTR_TYPE_2OP_SAE: {
+ SDValue Sae = Op.getOperand(3);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
@@ -21577,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
@@ -21590,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
- case INTR_TYPE_1OP_MASK_RM: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- SDValue RoundingMode;
- // We always add rounding mode to the Node.
- // If the rounding mode is not specified, we add the
- // "current direction" mode.
- if (Op.getNumOperands() == 4)
- RoundingMode =
- DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- else
- RoundingMode = Op.getOperand(4);
- assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- RoundingMode),
- Mask, PassThru, Subtarget, DAG);
- }
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getVectorMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, PassThru, Subtarget, DAG);
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_1OP_MASK_SAE: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Rnd = Op.getOperand(4);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Rnd))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Rnd))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21641,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getScalarMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Rnd),
- Mask, passThru, Subtarget, DAG);
+ return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
@@ -21654,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
- if (!isRoundModeCurDirection(Sae))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, passThru, Subtarget, DAG);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrWithRoundingModeOpcode;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
- case INTR_TYPE_SCALAR_MASK_RM: {
+ case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue Src0 = Op.getOperand(3);
+ SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // There are 2 kinds of intrinsics in this group:
- // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
- // (2) With rounding mode and sae - 7 operands.
- if (Op.getNumOperands() == 6) {
- SDValue Sae = Op.getOperand(5);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- Sae),
- Mask, Src0, Subtarget, DAG);
- }
- assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
- SDValue RoundingMode = Op.getOperand(5);
- SDValue Sae = Op.getOperand(6);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, Src0, Subtarget, DAG);
+ SDValue Rnd = Op.getOperand(5);
+
+ SDValue NewOp;
+ unsigned RC = 0;
+ if (isRoundModeCurDirection(Rnd))
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ else if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
-
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
- // TODO: Intrinsics should have fast-math-flags to propagate.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
- Mask, PassThru, Subtarget, DAG);
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK_RM: {
+ case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (6 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 6)
- Rnd = Op.getOperand(5);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Rnd),
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_SCALAR_MASK: {
+ case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
- Src2, Src3),
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK: {
+ case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3),
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case BLENDV: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+ Src3 = DAG.getBitcast(MaskVT, Src3);
+
+ // Reverse the operands to match VSELECT order.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+ }
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21783,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
- case CVTPD2PS:
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
- DAG.getIntPtrConstant(0, dl));
- case CVTPD2PS_RND_MASK: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- // We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- }
- assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- DAG.getIntPtrConstant(0, dl)),
- Mask, PassThru, Subtarget, DAG);
- }
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
@@ -21829,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
- SDValue Cmp;
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Rnd);
+ SDValue Sae = Op.getOperand(4);
+ if (isRoundModeSAE(Sae))
+ return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Sae);
+ if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
//default rounding mode
- if (!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
-
- return Cmp;
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -21856,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Cmp;
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
//default rounding mode
- if(!Cmp.getNode())
+ if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
@@ -21921,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
- else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ else if (isRoundModeSAE(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ else
+ return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
@@ -21940,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
- if (isAllOnesConstant(Mask)) // return data as is
+ if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- DataToCompress),
- Mask, PassThru, Subtarget, DAG);
+ // Avoid false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, VT);
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
+ Mask);
}
- case FIXUPIMMS:
- case FIXUPIMMS_MASKZ:
case FIXUPIMM:
- case FIXUPIMM_MASKZ:{
+ case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
- Src1 : getZeroVector(VT, Subtarget, DAG, dl);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (7 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 7)
- Rnd = Op.getOperand(6);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
- else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
+ SDValue Passthru = (IntrData->Type == FIXUPIMM)
+ ? Src1
+ : getZeroVector(VT, Subtarget, DAG, dl);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
+
+ if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
+ return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
+
+ return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
@@ -22018,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
- case CVTPD2I_MASK:
+ case CVTPD2DQ_MASK:
+ case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -22049,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
PassThru, Mask);
}
+ case CVTNEPS2BF16_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
+
+ // Break false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
+
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
+ Mask);
+ }
default:
break;
}
@@ -22279,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
- else // This function handles the SP or FP case.
- Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ else { // Handles the SP or FP case.
+ bool CantUseFP = RegInfo->needsStackRealignment(MF);
+ if (CantUseFP)
+ Reg = RegInfo->getPtrSizedStackRegister(MF);
+ else
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ }
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
+
+ case Intrinsic::x86_avx512_vp2intersect_q_512:
+ case Intrinsic::x86_avx512_vp2intersect_q_256:
+ case Intrinsic::x86_avx512_vp2intersect_q_128:
+ case Intrinsic::x86_avx512_vp2intersect_d_512:
+ case Intrinsic::x86_avx512_vp2intersect_d_256:
+ case Intrinsic::x86_avx512_vp2intersect_d_128: {
+ MVT MaskVT = Op.getSimpleValueType();
+
+ SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
+ SDLoc DL(Op);
+
+ SDValue Operation =
+ DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
+ Op->getOperand(1), Op->getOperand(2));
+
+ SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
+ MaskVT, Operation);
+ SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
+ MaskVT, Operation);
+ return DAG.getMergeValues({Result0, Result1}, DL);
+ }
}
}
@@ -22296,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- EVT MaskVT = Mask.getValueType();
+ EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
-static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget &Subtarget) {
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
@@ -22332,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22355,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -22366,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res.getValue(1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22392,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
-/// Handles the lowering of builtin intrinsic that return the value
-/// of the extended control register.
-static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
+/// Handles the lowering of builtin intrinsics with chain that return their
+/// value into registers EDX:EAX.
+/// If operand SrcReg is a valid register identifier, then operand 2 of N is
+/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
+/// TargetOpcode.
+/// Returns a Glue value which can be used to add extra copy-from-reg if the
+/// expanded intrinsic implicitly defines extra registers (i.e. not just
+/// EDX:EAX).
+static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ unsigned TargetOpcode,
+ unsigned SrcReg,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
- // The ECX register is used to select the index of the XCR register to
- // return.
- SDValue Chain =
- DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
- SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
+ if (SrcReg) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue N1Ops[] = {Chain, Glue};
+ SDNode *N1 = DAG.getMachineNode(
+ TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// Reads the content of XCR and returns it in registers EDX:EAX.
+ SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
@@ -22420,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
LO.getValue(2));
}
Chain = HI.getValue(1);
+ Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
- // Merge the two 32-bit values into a 64-bit one..
+ // Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
- return;
- }
-
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { LO, HI };
- SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
- Results.push_back(Pair);
- Results.push_back(Chain);
-}
-
-/// Handles the lowering of builtin intrinsics that read performance monitor
-/// counters (x86_rdpmc).
-static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
-
- // The ECX register is used to select the index of the performance counter
- // to read.
- SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
- N->getOperand(2));
- SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
-
- // Reads the content of a 64-bit performance counter and returns it in the
- // registers EDX:EAX.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- Chain = HI.getValue(1);
-
- if (Subtarget.is64Bit()) {
- // The EAX register is loaded with the low-order 32 bits. The EDX register
- // is loaded with the supported high-order bits of the counter.
- SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
- Results.push_back(Chain);
- return;
+ return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
@@ -22481,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
+ return Glue;
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
@@ -22490,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
- SDValue LO, HI;
-
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- SDValue Chain = HI.getValue(1);
-
- SDValue TSC;
- if (Subtarget.is64Bit()) {
- // The EDX register is loaded with the high-order 32 bits of the MSR, and
- // the EAX register is loaded with the low-order 32 bits.
- TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC);
- } else {
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI });
- }
-
- if (Opcode == X86ISD::RDTSCP_DAG) {
- assert(N->getNumOperands() == 2 && "Unexpected number of operands!");
-
- // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
- // the ECX register. Add 'ecx' explicitly to the chain.
- SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
- HI.getValue(2));
-
- Results.push_back(TSC);
- Results.push_back(ecx);
- Results.push_back(ecx.getValue(1));
+ SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+ /* NoRegister */0, Subtarget,
+ Results);
+ if (Opcode != X86::RDTSCP)
return;
- }
- Results.push_back(TSC);
- Results.push_back(Chain);
+ SDValue Chain = Results[1];
+ // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
+ Results[1] = ecx;
+ Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
- getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
@@ -22621,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
+ case llvm::Intrinsic::x86_rdpkru: {
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ // Create a RDPKRU node and pass 0 to the ECX parameter.
+ return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
+ case llvm::Intrinsic::x86_wrpkru: {
+ SDLoc dl(Op);
+ // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
+ // to the EDX and ECX parameters.
+ return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
+ Op.getOperand(0), Op.getOperand(2),
+ DAG.getConstant(0, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
@@ -22630,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
- // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+ // during FinalizeISel in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
@@ -22660,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
- SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ case Intrinsic::x86_enqcmd:
+ case Intrinsic::x86_enqcmds: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic!");
+ case Intrinsic::x86_enqcmd:
+ Opcode = X86ISD::ENQCMD;
+ break;
+ case Intrinsic::x86_enqcmds:
+ Opcode = X86ISD::ENQCMDS;
+ break;
+ }
+ SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
+ Op.getOperand(3));
+ SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
@@ -22707,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+ return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
@@ -22743,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
- case RDPMC: {
- SmallVector<SDValue, 2> Results;
- getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
- return DAG.getMergeValues(Results, dl);
- }
- // Get Extended Control Register.
+ case RDPMC:
+ // Get Extended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
- getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+
+ // RDPMC uses ECX to select the index of the performance counter to read.
+ // XGETBV uses ECX to select the index of the XCR register to return.
+ // The result is stored into registers EDX:EAX.
+ expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
+ Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
@@ -22861,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
- SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+ SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
@@ -23444,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
-
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
@@ -23539,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return split256IntArith(Op, DAG);
}
-static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
- switch (Op.getOpcode()) {
+ switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
- return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
+ // *addsat i1 X, Y --> X | Y
+ return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
- return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
- DAG.getNOT(dl, Op.getOperand(1), VT));
+ // *subsat i1 X, Y --> X & ~Y
+ return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
+ if (VT.is128BitVector()) {
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), VT);
+ SDLoc DL(Op);
+ if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
+ // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
+ }
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+ // Use default expansion.
+ return SDValue();
+ }
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -23886,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
- SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
-
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
@@ -23898,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
@@ -24156,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
APInt APIntShiftAmt;
if (!isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
+
+ // If the shift amount is out of range, return undef.
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+ return DAG.getUNDEF(VT);
+
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
@@ -24197,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+ APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
+ return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
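
For reference (illustrative check, not part of the patch): APInt::getHighBitsSet(8, 8 - ShiftAmt) sets bits [ShiftAmt, 8), which is exactly the mask the old uint8_t(-1U << ShiftAmt) expression produced for every in-range byte shift amount. A minimal standalone sketch:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned ShiftAmt = 0; ShiftAmt < 8; ++ShiftAmt) {
    uint8_t OldMask = uint8_t(-1U << ShiftAmt);    // previous expression
    uint8_t NewMask = uint8_t(0xFFu << ShiftAmt);  // high (8 - ShiftAmt) bits set
    assert(OldMask == NewMask);
  }
  return 0;
}
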
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -24224,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-// If V is a splat value, return the source vector and splat index;
-static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) {
- V = peekThroughEXTRACT_SUBVECTORs(V);
-
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
- switch (Opcode) {
- default: {
- APInt UndefElts;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
- if (DAG.isSplatValue(V, DemandedElts, UndefElts)) {
- // Handle case where all demanded elements are UNDEF.
- if (DemandedElts.isSubsetOf(UndefElts)) {
- SplatIdx = 0;
- return DAG.getUNDEF(VT);
- }
- SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
- return V;
- }
- break;
- }
- case ISD::VECTOR_SHUFFLE: {
- // Check if this is a shuffle node doing a splat.
- // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
- // getTargetVShiftNode currently struggles without the splat source.
- auto *SVN = cast<ShuffleVectorSDNode>(V);
- if (!SVN->isSplat())
- break;
- int Idx = SVN->getSplatIndex();
- int NumElts = V.getValueType().getVectorNumElements();
- SplatIdx = Idx % NumElts;
- return V.getOperand(Idx / NumElts);
- }
- }
-
- return SDValue();
-}
-
-static SDValue GetSplatValue(SDValue V, const SDLoc &dl,
- SelectionDAG &DAG) {
- int SplatIdx;
- if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG))
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- SrcVector.getValueType().getScalarType(), SrcVector,
- DAG.getIntPtrConstant(SplatIdx, dl));
- return SDValue();
-}
-
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -24282,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
- if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) {
+ if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
@@ -25102,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
- else if (OpWidth == 128)
+ return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
+ if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
- else
- return false;
+
+ return false;
}
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
+// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
- return needsCmpXchgNb(SI->getValueOperand()->getType());
+ Type *MemType = SI->getValueOperand()->getType();
+
+ bool NoImplicitFloatOps =
+ SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ return false;
+
+ return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
-// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- auto PTy = cast<PointerType>(LI->getPointerOperandType());
- return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
- : AtomicExpansionKind::None;
+ Type *MemType = LI->getType();
+
+  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
+ // can use movq to do the load. If we have X87 we can load into an 80-bit
+ // X87 register and store it to a stack temporary.
+ bool NoImplicitFloatOps =
+ LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
}
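
A hedged illustration of the kind of source this change affects (assumed example, not from the patch): a 64-bit atomic load on a 32-bit target with SSE2 or x87 is no longer forced through a lock cmpxchg8b expansion and can instead be selected as a single 64-bit load (movq, or fild via the x87 path handled later in ReplaceNodeResults).

#include <atomic>

long long load_counter(const std::atomic<long long> &Counter) {
  // On i386 with SSE2/x87 this can now stay a plain 64-bit atomic load.
  return Counter.load(std::memory_order_acquire);
}
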
TargetLowering::AtomicExpansionKind
@@ -25155,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
@@ -25171,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
+ // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+ // lowering available in lowerAtomicArith.
+ // TODO: push more cases through this path.
+ if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+ if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+ AI->use_empty())
+ return nullptr;
+
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
- auto Ptr = AI->getPointerOperand();
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -25212,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
- AI->getType()->getPrimitiveSizeInBits());
+ LoadInst *Loaded =
+ Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
+ AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix. Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable. Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, SDLoc DL) {
+ // Implementation notes:
+ // 1) LOCK prefix creates a full read/write reordering barrier for memory
+ // operations issued by the current processor. As such, the location
+ // referenced is not relevant for the ordering properties of the instruction.
+  //  See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 2) Using an immediate operand appears to be the best encoding choice
+ // here since it doesn't require an extra register.
+ // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+ // is small enough it might just be measurement noise.)
+ // 4) When choosing offsets, there are several contributing factors:
+ // a) If there's no redzone, we default to TOS. (We could allocate a cache
+ // line aligned stack object to improve this case.)
+ // b) To minimize our chances of introducing a false dependence, we prefer
+ // to offset the stack usage from TOS slightly.
+ // c) To minimize concerns about cross thread stack usage - in particular,
+ // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+ // captures state in the TOS frame and accesses it from many threads -
+ // we want to use an offset such that the offset is in a distinct cache
+ // line from the TOS frame.
+ //
+ // For a general discussion of the tradeoffs and benchmark results, see:
+ // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+ auto &MF = DAG.getMachineFunction();
+ auto &TFL = *Subtarget.getFrameLowering();
+ const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
+
+ if (Subtarget.is64Bit()) {
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::RSP, MVT::i64), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i64), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+ }
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+}
+
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -25235,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
- SDValue Chain = Op.getOperand(0);
- SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, dl, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, dl, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i32), // Segment.
- Zero,
- Chain
- };
- SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops);
- return SDValue(Res, 0);
+ SDValue Chain = Op.getOperand(0);
+ return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
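
As an illustrative example (assumption: a 32-bit target without MFENCE), the source pattern that reaches this path is a plain sequentially consistent fence, which now lowers through emitLockedStackOp rather than the hand-built locked OR against (%esp):

#include <atomic>

void full_fence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}
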
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
@@ -25288,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
- return SDValue();
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -25703,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -25717,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
- AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
@@ -25727,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
return N;
}
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+ // The core idea here is that since the memory location isn't actually
+ // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw. As such, we can choose a different operation and memory
+ // location to minimize impact on other code.
+ if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+ // On X86, the only ordering which actually requires an instruction is
+    // seq_cst that isn't SingleThread; everything else just needs to be
+    // preserved during codegen and then dropped. Note that we expect (but
+    // don't assume) that orderings other than seq_cst and acq_rel have been
+    // canonicalized to a store or load.
+ if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ AN->getSyncScopeID() == SyncScope::System) {
+ // Prefer a locked operation against a stack location to minimize cache
+ // traffic. This assumes that stack locations are very likely to be
+ // accessed only by the owning thread.
+ SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
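
A minimal sketch of the idiom this block targets (assumed example, not from the patch): an atomicrmw `or` of zero whose result is unused, written purely for its seq_cst ordering, so only the fence effect needs to survive lowering.

#include <atomic>

void ordering_only(std::atomic<int> &Flag) {
  (void)Flag.fetch_or(0, std::memory_order_seq_cst); // atomicrmw or %Flag, 0 seq_cst
}
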
+
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
- DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
- return SDValue();
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), LockOp.getValue(1));
}
-static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
- EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT VT = Node->getMemoryVT();
+
+ bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
+
+ // If this store is not sequentially consistent and the type is legal
+ // we can just keep it.
+ if (!IsSeqCst && IsTypeLegal)
+ return Op;
+
+ if (VT == MVT::i64 && !IsTypeLegal) {
+ // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
+ // FIXME: Use movlps with SSE1.
+ // FIXME: Use fist with X87.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ Subtarget.hasSSE2()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
+ SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
+ Ops, MVT::i64,
+ Node->getMemOperand());
+
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+
+ return Chain;
+ }
+ }
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
- // FIXME: On 32-bit, store -> fist or movq would be more efficient
- // (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
- if (cast<AtomicSDNode>(Node)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent ||
- !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2),
- cast<AtomicSDNode>(Node)->getMemOperand());
- return Swap.getValue(1);
- }
- // Other atomic stores have a simple pattern.
- return Op;
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ Node->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ Node->getMemOperand());
+ return Swap.getValue(1);
}
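
Hedged illustration (assumed example) of what benefits from the new path above: an i64 atomic store on a 32-bit SSE2 target can now be emitted as a vector-extract store (movq-style), with the locked stack op appended only for seq_cst, instead of always becoming an atomic swap loop.

#include <atomic>

void store_counter(std::atomic<long long> &Counter, long long V) {
  Counter.store(V, std::memory_order_release);
}
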
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
@@ -25919,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
return SDValue();
@@ -25935,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
// Custom widen all the operands to avoid promotion.
@@ -25980,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
@@ -25991,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
+ MVT MaskVT = Mask.getSimpleValueType();
+ SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
+ // Handle AVX masked loads which don't support passthru other than 0.
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ // We also allow undef in the isel pattern.
+ if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
+ return Op;
+
+ SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
+ N->getBasePtr(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl),
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType(),
+ N->isExpandingLoad());
+ // Emit a blend.
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
+ PassThru);
+ return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+ }
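
A rough intrinsics analogue of the zero-passthru load plus blend emitted above (illustration only; the function name is made up and this is not the generated code). Compile with AVX enabled:

#include <immintrin.h>

__m256 masked_load_with_passthru(const float *Ptr, __m256i Mask, __m256 PassThru) {
  // AVX maskload zeroes the masked-off lanes...
  __m256 Loaded = _mm256_maskload_ps(Ptr, Mask);
  // ...so blend the pass-through value back into those lanes.
  return _mm256_blendv_ps(PassThru, Loaded, _mm256_castsi256_ps(Mask));
}
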
+
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
@@ -26011,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// VLX the vector should be widened to 512 bit
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
- SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG);
+ PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
@@ -26179,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
- case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
@@ -26272,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
- case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG);
+ case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
@@ -26301,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N,
if (!Res.getNode())
return;
- assert((N->getNumValues() <= Res->getNumValues()) &&
+ // If the original node has one result, take the return value from
+ // LowerOperation as is. It might not be result number 0.
+ if (N->getNumValues() == 1) {
+ Results.push_back(Res);
+ return;
+ }
+
+ // If the original node has multiple results, then the return node should
+ // have the same number of results.
+ assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
  // Places new result values based on N result number.
- // In some cases (LowerSINT_TO_FP for example) Res has more result values
- // than original node, chain should be dropped(last value).
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
@@ -26319,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default:
+#ifndef NDEBUG
+ dbgs() << "ReplaceNodeResults: ";
+ N->dump(&DAG);
+#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::CTPOP: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // Use a v2i64 if possible.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
+ SDValue Wide =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
+ Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
+ // Bit count should fit in 32-bits, extract it as that and then zero
+ // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
+ Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
+ DAG.getIntPtrConstant(0, dl));
+ Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
+ Results.push_back(Wide);
+ }
+ return;
+ }
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
@@ -26385,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
+ case ISD::ABS: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(N->getValueType(0) == MVT::i64 &&
+ "Unexpected type (!= i64) on ABS.");
+ MVT HalfT = MVT::i32;
+ SDValue Lo, Hi, Tmp;
+ SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
+
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(0, dl, HalfT));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(1, dl, HalfT));
+ Tmp = DAG.getNode(
+ ISD::SRA, dl, HalfT, Hi,
+ DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
+ TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
+ SDValue(Lo.getNode(), 1));
+ Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
+ Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
+ Results.push_back(Lo);
+ Results.push_back(Hi);
+ return;
+ }
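
For intuition, a whole-width scalar sketch (illustration only) of the sign-mask trick that the expansion above applies to the split i32 halves via UADDO/ADDCARRY:

#include <cstdint>

static inline uint64_t abs_via_sign_mask(int64_t X) {
  uint64_t UX  = uint64_t(X);
  uint64_t Tmp = uint64_t(X >> 63);   // 0 if X >= 0, all-ones if X < 0
  return (UX + Tmp) ^ Tmp;            // conditionally negate
}
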
case ISD::SETCC: {
// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
  // setCC result type is v2i1 because type legalization will end up with
@@ -26557,14 +27607,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
- if (!ExperimentalVectorWideningLegalization)
- return;
-
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
@@ -26589,16 +27638,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) {
+ if (VT == MVT::v16i32 || VT == MVT::v8i64) {
+ if (!InVT.is128BitVector()) {
+ // Not a 128 bit vector, but maybe type legalization will promote
+ // it to 128 bits.
+ if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
+ return;
+ InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
+ if (!InVT.is128BitVector())
+ return;
+
+ // Promote the input to 128 bits. Type legalization will turn this into
+ // zext_inreg/sext_inreg.
+ In = DAG.getNode(N->getOpcode(), dl, InVT, In);
+ }
+
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
- bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
-
- SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
+ SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
@@ -26608,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
- Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
+ Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
@@ -26735,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- std::pair<SDValue,SDValue> Vals =
- FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode()) {
- // Return a load from the stack slot.
- if (StackSlot.getNode())
- Results.push_back(
- DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
- else
- Results.push_back(FIST);
- }
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ Results.push_back(V);
return;
}
case ISD::SINT_TO_FP: {
@@ -26800,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
- return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
-
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
+ Results);
+ return;
case Intrinsic::x86_xgetbv:
- return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
+ Results);
+ return;
}
}
- case ISD::INTRINSIC_WO_CHAIN: {
- if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
- Results.push_back(V);
- return;
- }
case ISD::READCYCLECOUNTER: {
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
- Results);
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
+ assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
+ "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
@@ -26903,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(EFLAGS.getValue(1));
return;
}
+ case ISD::ATOMIC_LOAD: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ auto *Node = cast<AtomicSDNode>(N);
+ if (Subtarget.hasSSE2()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
+ // lower 64-bits.
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register. This will put the whole
+ // integer into the significand.
+ // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ dl, Tys, Ops, MVT::i64,
+ Node->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // Now store the X87 register to a stack temporary and convert to i64.
+ // This store is not atomic and doesn't need to be.
+ // FIXME: We don't need a stack temporary if the result of the load
+ // is already being stored. We could just directly store there.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
+ DAG.getVTList(MVT::Other), StoreOps,
+ MVT::i64, MPI, 0 /*Align*/,
+ MachineMemOperand::MOStore);
+
+ // Finally load the value back from the stack temporary and return it.
+ // This load is not atomic and doesn't need to be.
+ // This load will be further type legalized.
+ Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ return;
+ }
+ }
+ // TODO: Use MOVLPS when SSE1 is available?
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
@@ -26914,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
- case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
- }
+
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
@@ -27061,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
- MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
- SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ if (Subtarget.hasSSE2()) {
+ MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+ SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Chain = Res.getValue(1);
+ MVT WideVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ Res = DAG.getBitcast(CastVT, Res);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
- Results.push_back(Chain);
+ Results.push_back(Res.getValue(1));
return;
}
}
@@ -27092,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
- case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
- case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FIST: return "X86ISD::FIST";
+ case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
- case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
- case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
+ case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
+ case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -27140,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
- case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
+ case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
- case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
+ case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
+ case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
@@ -27177,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
@@ -27188,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
- case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
+ case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
+ case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
+ case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
+ case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
@@ -27202,6 +28320,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSHLV: return "X86ISD::VSHLV";
+ case X86ISD::VSRLV: return "X86ISD::VSRLV";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
@@ -27263,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
+ case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
+ case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
+ case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
@@ -27281,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
+ case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
@@ -27302,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
+ case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
+ case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
+ case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
+ case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
+ case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
+ case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
@@ -27323,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RCP14: return "X86ISD::RCP14";
case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
+ case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
+ case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS: return "X86ISD::FADDS";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS: return "X86ISD::FSUBS";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS: return "X86ISD::FMULS";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS: return "X86ISD::FDIVS";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
- case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
+ case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
+ case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
+ case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
+ case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
+ case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
@@ -27351,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
- case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
+ case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
+ case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
- case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
+ case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
+ case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
+ case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+ case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
+ case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
@@ -27378,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
+ case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
+ case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
+ case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
@@ -27393,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
+  case X86ISD::ENQCMD: return "X86ISD::ENQCMD";
+  case X86ISD::ENQCMDS: return "X86ISD::ENQCMDS";
+ case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
}
return nullptr;
}
@@ -27478,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return true;
}
+bool X86TargetLowering::isBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // These are non-commutative binops.
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::ANDNP:
+ case X86ISD::PCMPGT:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case X86ISD::FANDN:
+ return true;
+ }
+
+ return TargetLoweringBase::isBinOp(Opcode);
+}
+
+bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::PCMPEQ:
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ:
+ case X86ISD::FMAXC:
+ case X86ISD::FMINC:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ return true;
+ }
+
+ return TargetLoweringBase::isCommutativeBinOp(Opcode);
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
@@ -27713,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert input VAL into EAX
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI.getOperand(0).getReg());
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert zero to EDX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
-
- // insert WRPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert RDPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::EAX);
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget,
- unsigned Opc) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX, other two args into ECX, EDX.
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
-
- unsigned ValOps = X86::AddrNumOperands;
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI.getOperand(ValOps).getReg());
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
- .addReg(MI.getOperand(ValOps + 1).getReg());
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(Opc));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI->getOperand(i));
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
-
- MI->eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
MachineBasicBlock *
@@ -27823,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
+ MachineFunction *MF = MBB->getParent();
+
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
- SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(),
- MI.memoperands_end());
+
+ MachineMemOperand *OldMMO = MI.memoperands().front();
+
+ // Clone the MMO into two separate MMOs for loading and storing
+ MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
+ MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -27891,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction *MF = MBB->getParent();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -27924,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -27933,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
- BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
- .addMBB(overflowMBB);
+ BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
@@ -27949,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -27977,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -27996,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -28033,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -28091,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
@@ -28371,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
- unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
- unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
- BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
+ BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
@@ -28463,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
- MachineBasicBlock::iterator NextMIIt =
- std::next(MachineBasicBlock::iterator(MI));
+ MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
- // See if we have a string of CMOVS with the same condition.
+ // See if we have a string of CMOVS with the same condition. Skip over
+ // intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
+ NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
@@ -28508,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
SinkMBB->addLiveIn(X86::EFLAGS);
}
+ // Transfer any debug instructions inside the CMOV sequence to the sunk block.
+ auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
+ auto DbgIt = MachineBasicBlock::iterator(MI);
+ while (DbgIt != DbgEnd) {
+ auto Next = std::next(DbgIt);
+ if (DbgIt->isDebugInstr())
+ SinkMBB->push_back(DbgIt->removeFromParent());
+ DbgIt = Next;
+ }
+
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
- SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
@@ -28522,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
- unsigned Opc = X86::GetCondBranchFromCond(CC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
@@ -28540,53 +29655,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- // Combine the following atomic floating-point modification pattern:
- // a.store(reg OP a.load(acquire), release)
- // Transform them into:
- // OPss (%gpr), %xmm
- // movss %xmm, (%gpr)
- // Or sd equivalent for 64-bit operations.
- unsigned MOp, FOp;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
- case X86::RELEASE_FADD32mr:
- FOp = X86::ADDSSrm;
- MOp = X86::MOVSSmr;
- break;
- case X86::RELEASE_FADD64mr:
- FOp = X86::ADDSDrm;
- MOp = X86::MOVSDmr;
- break;
- }
- const X86InstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned ValOpIdx = X86::AddrNumOperands;
- unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, DL, TII->get(FOp),
- MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
- .addReg(VSrc);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand &Operand = MI.getOperand(i);
- // Clear any kill flags on register operands as we'll create a second
- // instruction using the same address operands.
- if (Operand.isReg())
- Operand.setIsKill(false);
- MIB.add(Operand);
- }
- MachineInstr *FOpMI = MIB;
- MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
- MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
- MI.eraseFromParent(); // The pseudo instruction is gone now.
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
@@ -28652,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
@@ -29279,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
- BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
@@ -29309,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
- BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
+ BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
@@ -29332,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addImm(8);
// Jump if the result of the shift is zero.
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+ BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
@@ -29367,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
- BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
+ BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
@@ -29512,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- int FI = MFI.getFunctionContextIndex();
+ int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
@@ -29613,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
- BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
+ BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
@@ -29766,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -29821,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case X86::RELEASE_FADD32mr:
- case X86::RELEASE_FADD64mr:
- return EmitLoweredAtomicFP(MI, BB);
-
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
@@ -29836,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FNSTCW16m)), CWFrameIdx);
+ TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
- // Load the old value of the high byte of the control word...
+ // Load the old value of the control word...
unsigned OldCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
+ OrigCWFrameIdx);
+
+ // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
+ unsigned NewCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
+ .addReg(OldCW, RegState::Kill).addImm(0xC00);
+
+ // Extract to 16 bits.
+ unsigned NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
- CWFrameIdx);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
+ .addReg(NewCW, RegState::Kill, X86::sub_16bit);
- // Set the high part to be round to zero...
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
- .addImm(0xC7F);
+ // Prepare memory for FLDCW.
+ int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
+ NewCWFrameIdx)
+ .addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
-
- // Restore the memory image of control word to original value
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
- .addReg(OldCW);
+ TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
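The replacement sequence above no longer stores a hand-crafted 0xC7F constant; it reads the saved control word, forces the rounding-control field, and writes the result to a second stack slot for FLDCW. The rounding-control field of the x87 control word is bits 11:10, and 0b11 there selects round toward zero, so OR-ing in 0xC00 is enough. A minimal standalone sketch of that word manipulation (plain C++, not the MachineInstr builders used in the diff):

#include <cassert>
#include <cstdint>

// Build the control word used while truncating: take the saved control word,
// force the rounding-control field (bits 11:10) to 0b11 (round toward zero),
// and truncate back to the 16 bits that FLDCW expects.
static uint16_t roundTowardZeroControlWord(uint16_t SavedCW) {
  uint32_t Widened = SavedCW;       // MOVZX32rm16: zero-extend to 32 bits.
  uint32_t NewCW = Widened | 0xC00; // OR32ri: set bits 10 and 11.
  return uint16_t(NewCW);           // COPY of sub_16bit, stored with MOV16mr.
}

int main() {
  // The common default control word 0x027F (round to nearest) becomes 0x0E7F.
  assert(roundTowardZeroControlWord(0x027F) == 0x0E7F);
  // A word that already has RC == 0b11 is unchanged.
  assert(roundTowardZeroControlWord(0x0E7F) == 0x0E7F);
  return 0;
}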
@@ -29879,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
+ TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
- // Thread synchronization.
- case X86::MONITOR:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
- case X86::MONITORX:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
-
- // Cache line zero
- case X86::CLZERO:
- return emitClzero(&MI, BB, Subtarget);
-
- // PKU feature
- case X86::WRPKRU:
- return emitWRPKRU(MI, BB, Subtarget);
- case X86::RDPKRU:
- return emitRDPKRU(MI, BB, Subtarget);
+
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
@@ -30093,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth);
+ Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -30150,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = Known.trunc(BitWidth);
break;
}
+ case X86ISD::ANDNP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // ANDNP = (~X & Y);
+ Known.One &= Known2.Zero;
+ Known.Zero |= Known2.One;
+ break;
+ }
+ case X86ISD::FOR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ Known.Zero &= Known2.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ Known.One |= Known2.One;
+ break;
+ }
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
// If we don't know any bits, early out.
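The two cases added above follow the usual known-bits algebra: ANDNP computes (~X) & Y, so a result bit is known one exactly where X is known zero and Y is known one, and known zero where X is known one or Y is known zero; FOR is a plain OR. A toy model with 8-bit masks (a stand-in for LLVM's APInt-based KnownBits; the names here are invented for the sketch):

#include <cassert>
#include <cstdint>

// Toy stand-in for a known-bits pair over an 8-bit value: Zero/One record the
// bits known to be 0/1 respectively (they never overlap).
struct Known8 {
  uint8_t Zero, One;
};

// X86ISD::ANDNP computes (~X) & Y.
static Known8 knownAndNot(Known8 X, Known8 Y) {
  Known8 K;
  K.One = Y.One & X.Zero;  // 1 iff Y is 1 and X is 0.
  K.Zero = Y.Zero | X.One; // 0 if Y is 0 or X is 1.
  return K;
}

// X86ISD::FOR is a bitwise OR in the floating-point domain.
static Known8 knownOr(Known8 X, Known8 Y) {
  Known8 K;
  K.Zero = X.Zero & Y.Zero; // 0 only where both inputs are 0.
  K.One = X.One | Y.One;    // 1 where either input is 1.
  return K;
}

int main() {
  Known8 X{0x0F, 0xF0}; // X is known to be exactly 0xF0.
  Known8 Y{0x00, 0x81}; // Bits 0 and 7 of Y are 1, the rest are unknown.
  Known8 A = knownAndNot(X, Y);
  assert(A.One == 0x01 && A.Zero == 0xF0); // (~0xF0) & Y keeps only bit 0.
  Known8 O = knownOr(X, Y);
  assert(O.One == 0xF1 && O.Zero == 0x00);
  return 0;
}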
@@ -30219,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned VTBits = Op.getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
@@ -30257,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30268,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30284,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
// Vector compares return zero/all-bits result values.
return VTBits;
+ case X86ISD::ANDNP: {
+ unsigned Tmp0 =
+ DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp0, Tmp1);
+ }
+
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
@@ -30292,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
+ if (isTargetShuffle(Opcode)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ return 1;
+ } else if (M == SM_SentinelZero) {
+ // Zero = all sign bits.
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ return 1;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ unsigned Tmp0 = VTBits;
+ for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
+ if (!DemandedOps[i])
+ continue;
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
+ Tmp0 = std::min(Tmp0, Tmp1);
+ }
+ return Tmp0;
+ }
+ }
+ }
+
// Fallback case.
return 1;
}
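The target-shuffle fallthrough above distributes the demanded result lanes across the shuffle's inputs and then takes the minimum sign-bit count over the inputs that are actually demanded; undef mask entries give up, zero entries count as all sign bits. A condensed standalone model of that bookkeeping (plain vectors, a 64-lane bitmask, and a callback instead of SelectionDAG; the function name is invented for the sketch):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

// Shuffle-mask sentinels, mirroring the combiner's convention.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Given a shuffle mask over NumOps source vectors of NumElts elements each,
// split the demanded result lanes into per-operand demanded masks and return
// the minimum sign-bit count over the operands actually used. SignBitsOfOp
// stands in for querying one operand with its demanded-element mask.
static unsigned signBitsThroughShuffle(
    const std::vector<int> &Mask, uint64_t DemandedElts, unsigned NumOps,
    unsigned NumElts, unsigned VTBits,
    const std::function<unsigned(unsigned, uint64_t)> &SignBitsOfOp) {
  std::vector<uint64_t> DemandedOps(NumOps, 0);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (!(DemandedElts & (uint64_t(1) << i)))
      continue;
    int M = Mask[i];
    if (M == SentinelUndef)
      return 1; // Nothing is known about undef lanes.
    if (M == SentinelZero)
      continue; // Zero lanes are all sign bits.
    DemandedOps[unsigned(M) / NumElts] |= uint64_t(1) << (unsigned(M) % NumElts);
  }
  unsigned Result = VTBits;
  for (unsigned Op = 0; Op != NumOps && Result > 1; ++Op)
    if (DemandedOps[Op])
      Result = std::min(Result, SignBitsOfOp(Op, DemandedOps[Op]));
  return Result;
}

int main() {
  // An unpcklps-style v4i32 mask {0, 4, 1, 5} with only the even result lanes
  // demanded: only input 0 ends up being queried.
  std::vector<int> Mask = {0, 4, 1, 5};
  unsigned Bits = signBitsThroughShuffle(
      Mask, /*DemandedElts=*/0b0101, /*NumOps=*/2, /*NumElts=*/4, /*VTBits=*/32,
      [](unsigned Op, uint64_t) { return Op == 0 ? 17u : 3u; });
  assert(Bits == 17);
  return 0;
}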
@@ -30305,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, unsigned &Shuffle,
+ MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
@@ -30322,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return true;
}
- // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
- bool Match = true;
+ bool MatchAny = true;
+ bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
- for (unsigned i = 0; i != NumDstElts && Match; ++i) {
- Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
- Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+ if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
+ MatchAny = MatchZero = false;
+ break;
+ }
+ MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
+ MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
- if (Match) {
+ if (MatchAny || MatchZero) {
+ assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
@@ -30343,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
- if (SrcVT.getVectorNumElements() == NumDstElts)
- Shuffle = unsigned(ISD::ZERO_EXTEND);
- else
- Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
+ Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+ if (SrcVT.getVectorNumElements() != NumDstElts)
+ Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
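The rewritten extension matcher distinguishes two filler patterns between the kept source elements: all-undef filler can become an ANY_EXTEND, while undef-or-zero filler needs a ZERO_EXTEND, and the any-extend form is preferred when both apply. A small sketch of that classification over a plain int mask, using the usual -1 (undef) and -2 (zero) sentinels; it approximates isUndefInRange/isUndefOrZeroInRange rather than calling them:

#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

enum class ExtendKind { None, Any, Zero };

// Classify a shuffle mask as an extension by 'Scale': result element i must
// come from source element i (or be undef), and the Scale-1 filler lanes after
// it must be undef (any-extend) or undef/zero (zero-extend).
static ExtendKind classifyExtend(const std::vector<int> &Mask, unsigned Scale) {
  bool MatchAny = true, MatchZero = true;
  unsigned NumDstElts = unsigned(Mask.size()) / Scale;
  for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
    if (Mask[i * Scale] != int(i) && Mask[i * Scale] != SentinelUndef)
      return ExtendKind::None;
    for (unsigned j = 1; j != Scale; ++j) {
      int M = Mask[i * Scale + j];
      MatchAny = MatchAny && (M == SentinelUndef);
      MatchZero = MatchZero && (M == SentinelUndef || M == SentinelZero);
    }
  }
  if (MatchAny)
    return ExtendKind::Any; // Undef filler also satisfies zero; prefer aext.
  return MatchZero ? ExtendKind::Zero : ExtendKind::None;
}

int main() {
  // v8i16 -> v4i32 zero-extension: {0, Z, 1, Z, 2, Z, 3, Z}.
  assert(classifyExtend({0, -2, 1, -2, 2, -2, 3, -2}, 2) == ExtendKind::Zero);
  // Undef filler lanes only need an any-extension.
  assert(classifyExtend({0, -1, 1, -1, 2, -1, 3, -1}, 2) == ExtendKind::Any);
  return 0;
}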
@@ -30368,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
@@ -30426,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- // Attempt to match against broadcast-from-vector.
- if (Subtarget.hasAVX2()) {
- SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
- if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
- SrcVT = DstVT = MaskVT;
- Shuffle = X86ISD::VBROADCAST;
- return true;
- }
- }
-
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- const APInt &Zeroable,
- bool AllowFloatDomain,
- bool AllowIntDomain,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
+static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
@@ -30549,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskScalarSizeInBits, Mask,
- 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
+ Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
@@ -30564,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDValue &V2, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
- bool IsUnary) {
+static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
+ bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
@@ -30631,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-static bool matchBinaryPermuteVectorShuffle(
+static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
@@ -30642,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle(
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
@@ -30678,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle(
return true;
}
} else {
- // Determine a type compatible with X86ISD::BLENDI.
- ShuffleVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v8i32;
- else if (ShuffleVT == MVT::v2i64)
- ShuffleVT = MVT::v4i32;
- } else {
- if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
- ShuffleVT = MVT::v8i16;
- else if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v4f64;
- else if (ShuffleVT == MVT::v8i32)
- ShuffleVT = MVT::v8f32;
- }
-
- if (!ShuffleVT.isFloatingPoint()) {
- int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
- BlendMask =
- scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
- ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
- }
-
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
return true;
}
}
@@ -30715,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle(
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
if (Zeroable.getBoolValue() &&
- matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
@@ -30727,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle(
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
- if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
@@ -30784,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle(
return false;
}
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget);
+
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
@@ -30841,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ // Attempt to match a subvector broadcast.
+ // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
+ if (UnaryShuffle &&
+ (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
+ SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
+ if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
+ SDValue Src = Inputs[0];
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(0).isUndef() &&
+ Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
+ MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
+ return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
+ Src.getValueType(),
+ Src.getOperand(1)));
+ }
+ }
+ }
+
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
@@ -30894,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
+ // TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
@@ -30909,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+ MaskEltSizeInBits) == 0) {
+ unsigned Scale =
+ cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+ MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
@@ -30918,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to match against broadcast-from-vector.
+ // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
+ if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
+ && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ if (V1.getValueType() == MaskVT &&
+ V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ MayFoldLoad(V1.getOperand(0))) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = V1.getOperand(0);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ if (Subtarget.hasAVX2()) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getBitcast(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ }
+
SDValue NewV1 = V1; // Save operand in case early exit happens.
- if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT) &&
+ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30930,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, Subtarget, Shuffle,
- ShuffleVT, PermuteImm) &&
+ if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30945,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
- if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
+ if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30959,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteVectorShuffle(
+ if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
@@ -30979,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
- Zeroable)) {
+ if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -30990,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -31057,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
@@ -31222,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
+ // If we have a dual input shuffle then lower to VPERMV3.
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
+ MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = DAG.getBitcast(MaskVT, V2);
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
// Failed to find any combines.
return SDValue();
}
+// Combine an arbitrary chain of shuffles + extract_subvectors into a single
+// instruction if possible.
+//
+// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
+// type size to attempt to combine:
+// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
+// -->
+// extract_subvector(shuffle(x,y,m2),0)
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NumMaskElts = BaseMask.size();
+ unsigned NumInputs = Inputs.size();
+ if (NumInputs == 0)
+ return SDValue();
+
+ SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+ SmallVector<unsigned, 4> Offsets(NumInputs, 0);
+
+ // Peek through subvectors.
+ // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+ unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
+ for (unsigned i = 0; i != NumInputs; ++i) {
+ SDValue &Src = WideInputs[i];
+ unsigned &Offset = Offsets[i];
+ Src = peekThroughBitcasts(Src);
+ EVT BaseVT = Src.getValueType();
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(Src.getOperand(1))) {
+ Offset += Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+ }
+ WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
+ "Unexpected subvector extraction");
+ Offset /= BaseVT.getVectorNumElements();
+ Offset *= NumMaskElts;
+ }
+
+ // Bail if we're always extracting from the lowest subvectors, since
+ // combineX86ShuffleChain should match this for the current width.
+ if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
+ return SDValue();
+
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned Scale = WideSizeInBits / RootSizeInBits;
+ assert((WideSizeInBits % RootSizeInBits) == 0 &&
+ "Unexpected subvector extraction");
+
+ // If the src vector types aren't the same, see if we can extend
+ // them to match each other.
+ // TODO: Support different scalar types?
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType();
+ if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
+ return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
+ Op.getValueType().getScalarType() != WideSVT;
+ }))
+ return SDValue();
+
+ for (SDValue &NewInput : WideInputs) {
+ assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
+ "Shuffle vector size mismatch");
+ if (WideSizeInBits > NewInput.getValueSizeInBits())
+ NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
+ SDLoc(NewInput), WideSizeInBits);
+ assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
+ "Unexpected subvector extraction");
+ }
+
+ // Create new mask for larger type.
+ for (unsigned i = 1; i != NumInputs; ++i)
+ Offsets[i] += i * Scale * NumMaskElts;
+
+ SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
+ for (int &M : WideMask) {
+ if (M < 0)
+ continue;
+ M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+ }
+ WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
+ assert(!WideInputs.empty() && "Shuffle with no inputs detected");
+
+ if (WideInputs.size() > 2)
+ return SDValue();
+
+ // Increase depth for every upper subvector we've peeked through.
+ Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+
+ // Attempt to combine wider chain.
+ // TODO: Can we use a better Root?
+ SDValue WideRoot = WideInputs[0];
+ if (SDValue WideShuffle = combineX86ShuffleChain(
+ WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget)) {
+ WideShuffle =
+ extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
+ return DAG.getBitcast(RootVT, WideShuffle);
+ }
+ return SDValue();
+}
+
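The core of combineX86ShuffleChainWithExtract is re-basing each mask entry onto the wide source vectors: index M, which referred to element M % NumMaskElts of extracted input M / NumMaskElts, becomes that element plus the input's precomputed offset, and the mask is padded with undef out to the wide width. A minimal illustration of that remapping with made-up values (offsets already scaled to mask elements, as in the diff):

#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1;

// Rewrite a narrow shuffle mask over extracted subvectors into a mask over the
// wide source vectors. Offsets[i] is the element offset of input i inside its
// wide vector (in units of mask elements), plus i * Scale * NumMaskElts so
// that each wide input occupies its own index range. Scale is the ratio of
// wide to narrow vector width.
static std::vector<int>
widenExtractedShuffleMask(std::vector<int> Mask,
                          const std::vector<unsigned> &Offsets, unsigned Scale) {
  unsigned NumMaskElts = unsigned(Mask.size());
  for (int &M : Mask) {
    if (M < 0)
      continue;
    M = (M % int(NumMaskElts)) + int(Offsets[unsigned(M) / NumMaskElts]);
  }
  Mask.insert(Mask.end(), (Scale - 1) * NumMaskElts, SentinelUndef);
  return Mask;
}

int main() {
  // shuffle(extract_subvector(x, 4), extract_subvector(y, 0), {0,1,4,5}) with
  // v8i32 sources: input 0 starts at wide element 4, input 1 gets the range
  // base 1 * Scale * NumMaskElts = 8.
  std::vector<int> Wide =
      widenExtractedShuffleMask({0, 1, 4, 5}, /*Offsets=*/{4, 8}, /*Scale=*/2);
  std::vector<int> Expected = {4, 5, 8, 9, -1, -1, -1, -1};
  assert(Wide == Expected);
  return 0;
}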
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -31370,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively(
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return SDValue();
- // TODO - Add support for more than 2 inputs.
- if (2 < OpInputs.size())
- return SDValue();
-
- SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
- SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
-
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- if (!Input)
- return -1;
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
@@ -31398,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively(
return Ops.size() - 1;
};
- int InputIdx0 = AddOp(Input0, SrcOpIndex);
- int InputIdx1 = AddOp(Input1, -1);
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
@@ -31471,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively(
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- if (OpMask[OpIdx] < (int)OpMask.size()) {
- assert(0 <= InputIdx0 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx0 * MaskWidth;
- } else {
- assert(0 <= InputIdx1 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx1 * MaskWidth;
- }
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
@@ -31493,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively(
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
- // Remove unused shuffle source ops.
+ // Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
@@ -31530,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively(
return Cst;
// We can only combine unary and binary shuffle mask cases.
- if (Ops.size() > 2)
- return SDValue();
+ if (Ops.size() <= 2) {
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ SmallVector<int, 64> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ }
+
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
- // Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with sequential pairs of
- // elements, and shrink them to the half-width mask. It does this in a loop
- // so it will reduce the size of the mask to the minimal width mask which
- // performs an equivalent shuffle.
- SmallVector<int, 64> WidenedMask;
- while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
- Mask = std::move(WidenedMask);
+ // Finally, try to combine into a single shuffle instruction.
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
}
- // Canonicalization of binary shuffle masks to improve pattern matching by
- // commuting the inputs.
- if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(Ops[0], Ops[1]);
- }
+ // If that failed and any input is extracted then try to combine as a
+ // shuffle with the larger type.
+ return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+ HasVariableMask, AllowVariableMask,
+ DAG, Subtarget);
+}
- // Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget);
+/// Helper entry wrapper to combineX86ShufflesRecursively.
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget);
}
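The widening loop that now sits under the Ops.size() <= 2 guard repeatedly folds sequential pairs of mask elements into a single wider element until no further halving is possible. A rough stand-in for canWidenShuffleElements over plain ints (the real helper also handles undef and zero sentinels, which are omitted here):

#include <cassert>
#include <vector>

// Try to express a shuffle mask with elements twice as wide: each pair
// (2*k, 2*k+1) of source indices must itself be an even/odd pair.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 || Hi != Lo + 1 || (Lo % 2) != 0)
      return false;
    Widened.push_back(Lo / 2);
  }
  return true;
}

int main() {
  // v8i16 mask -> v4i32 mask -> v2i64 mask, after which widening stops.
  std::vector<int> Mask = {0, 1, 2, 3, 8, 9, 10, 11}, Widened;
  while (Mask.size() > 1 && widenShuffleMask(Mask, Widened))
    Mask = Widened;
  std::vector<int> Expected = {0, 2}; // i.e. an unpcklqdq-style v2i64 shuffle.
  assert(Mask == Expected);
  return 0;
}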
/// Get the PSHUF-style mask from PSHUF node.
@@ -31770,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
switch (Opcode) {
case X86ISD::VBROADCAST: {
- // If broadcasting from another shuffle, attempt to simplify it.
- // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
+
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
@@ -31789,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
+
+ // broadcast(bitcast(src)) -> bitcast(broadcast(src))
+ // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
+ if (Src.getOpcode() == ISD::BITCAST &&
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
+ }
+
+ // Reduce broadcast source vector to lowest 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ extract128BitVector(Src, 0, DAG, DL));
+
+ // broadcast(scalar_to_vector(x)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
+ // Share broadcast with the longest vector and extract low subvector (free).
+ for (SDNode *User : Src->uses())
+ if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL,
+ VT.getSizeInBits());
+ }
+
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+ // TODO: Handle MVT::v16i16 repeated blend mask.
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+ if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ SrcVT.getScalarSizeInBits() >= 32) {
+ unsigned Mask = N.getConstantOperandVal(2);
+ unsigned Size = VT.getVectorNumElements();
+ unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+ unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getConstant(ScaleMask, DL, MVT::i8)));
+ }
+ }
+ return SDValue();
+ }
+ case X86ISD::VPERMI: {
+ // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
+ // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
+ return DAG.getBitcast(VT, Res);
+ }
return SDValue();
}
case X86ISD::PSHUFD:
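The new BLENDI case rewrites blend(bitcast(x), bitcast(y)) as a bitcast of a blend on the narrower source type, which requires repeating each blend-mask bit once per narrow element it now covers; that is the job scaleVectorShuffleBlendMask performs in the diff. A tiny model of the scaling on plain unsigned masks (helper name invented here):

#include <cassert>

// Repeat each of the low 'Size' bits of a blend mask 'Scale' times, producing
// the equivalent mask for elements that are Scale times narrower.
static unsigned scaleBlendMask(unsigned Mask, unsigned Size, unsigned Scale) {
  unsigned Scaled = 0;
  for (unsigned i = 0; i != Size; ++i)
    if (Mask & (1u << i))
      Scaled |= ((1u << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // A v4i64 blend with mask 0b0110 expressed as a v8i32 blend: every mask bit
  // now covers two 32-bit elements.
  assert(scaleBlendMask(0b0110, /*Size=*/4, /*Scale=*/2) == 0b00111100);
  return 0;
}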
@@ -32212,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
- if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ // For a broadcast, peek through an extract element of index 0 to find the
+ // horizontal op: broadcast (ext_vec_elt HOp, 0)
+ EVT VT = N->getValueType(0);
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDValue SrcOp = N->getOperand(0);
+ if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ SrcOp.getValueType() == MVT::f64 &&
+ SrcOp.getOperand(0).getValueType() == VT &&
+ isNullConstant(SrcOp.getOperand(1)))
+ N = SrcOp.getNode();
+ }
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
@@ -32224,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
- // TODO: Handle UNDEF operands.
- if (HOp.getOperand(0) != HOp.getOperand(1))
+ if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
+ HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
- // the result is the same as the high half. If the shuffle is also replicating
- // low and high halves, we don't need the shuffle.
+ // the result is the same as the high half. If a target shuffle is also
+ // replicating low and high halves, we don't need the shuffle.
+ if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
+ if (HOp.getScalarValueSizeInBits() == 64) {
+ // movddup (hadd X, X) --> hadd X, X
+ // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
+ assert((HOp.getValueType() == MVT::v2f64 ||
+ HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
+ "Unexpected type for h-op");
+ return HOp;
+ }
+ return SDValue();
+ }
+
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
@@ -32252,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
return SDValue();
}
+/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
+/// low half of each source vector and does not set any high half elements in
+/// the destination vector, narrow the shuffle to half its original size.
+static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+ if (!Shuf->getValueType(0).isSimple())
+ return SDValue();
+ MVT VT = Shuf->getSimpleValueType(0);
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // See if we can ignore all of the high elements of the shuffle.
+ ArrayRef<int> Mask = Shuf->getMask();
+ if (!isUndefUpperHalf(Mask))
+ return SDValue();
+
+ // Check if the shuffle mask accesses only the low half of each input vector
+ // (half-index output is 0 or 2).
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(Mask.size() / 2);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
+ (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
+ return SDValue();
+
+ // Create a half-width shuffle to replace the unnecessarily wide shuffle.
+ // The trick is knowing that all of the insert/extract are actually free
+ // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
+ // of narrow inputs into a narrow output, and that is always cheaper than
+ // the wide shuffle that we started with.
+ return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
+ Shuf->getOperand(1), HalfMask, HalfIdx1,
+ HalfIdx2, false, DAG);
+}
+
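narrowShuffle only fires when the upper half of the result is entirely undef and every used element comes from the low half of one of the two sources; getHalfShuffleMask also recognises high-half sources, which the narrowing then rejects via the HalfIdx parity check. A condensed model of the low-half-only legality test for a two-input shuffle (simplified relative to the real helper):

#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1;

// Return true if a two-input shuffle of vectors with NumElts elements can be
// narrowed: the upper half of the result is undef and all used elements live
// in the lower half of either source. HalfMask receives the narrowed mask.
static bool canNarrowToHalfWidth(const std::vector<int> &Mask,
                                 std::vector<int> &HalfMask) {
  unsigned NumElts = unsigned(Mask.size());
  unsigned Half = NumElts / 2;
  // The upper half of the result must be completely undef.
  for (unsigned i = Half; i != NumElts; ++i)
    if (Mask[i] != SentinelUndef)
      return false;
  // The lower half may only read the lower half of input 0 ([0, Half)) or the
  // lower half of input 1 ([NumElts, NumElts + Half)).
  HalfMask.assign(Mask.begin(), Mask.begin() + Half);
  for (int &M : HalfMask) {
    if (M == SentinelUndef)
      continue;
    if (unsigned(M) < Half)
      continue;                    // Low half of input 0: index unchanged.
    if (unsigned(M) >= NumElts && unsigned(M) < NumElts + Half) {
      M = int(unsigned(M) - Half); // Rebase onto the narrow second input.
      continue;
    }
    return false;                  // Touches a high half: keep the wide op.
  }
  return true;
}

int main() {
  // A v8f32 shuffle using only the low v4f32 halves of both inputs.
  std::vector<int> Mask = {0, 8, 1, 9, -1, -1, -1, -1}, HalfMask;
  assert(canNarrowToHalfWidth(Mask, HalfMask));
  std::vector<int> Expected = {0, 4, 1, 5}; // v4f32 unpcklps-style mask.
  assert(HalfMask == Expected);
  return 0;
}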
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
+ if (SDValue V = narrowShuffle(Shuf, DAG))
+ return V;
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we have legalized the vector types, look for blends of FADD and FSUB
- // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
@@ -32328,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
}
- // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
- // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
- // consecutive, non-overlapping, and in the right order.
- SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
- Elts.push_back(Elt);
- continue;
- }
- Elts.clear();
- break;
- }
-
- if (Elts.size() == VT.getVectorNumElements())
- if (SDValue LD =
- EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
- return LD;
+ // Attempt to combine into a vector load/broadcast.
+ if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -32365,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
@@ -32378,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
+ // in the upper 64 bits.
+ // TODO: Can we generalize this using computeKnownBits.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
+ (VT == MVT::v2f64 || VT == MVT::v2i64) &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
+ N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
+ SDValue In = N->getOperand(0).getOperand(0);
+ switch (In.getOpcode()) {
+ default:
+ break;
+ case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
+ case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
+ case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
+ if (In.getOperand(0).getValueType() == MVT::v2f64 ||
+ In.getOperand(0).getValueType() == MVT::v2i64)
+ return N->getOperand(0); // return the bitcast
+ break;
+ }
+ }
+
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+ N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N->getOperand(0).hasOneUse() &&
+ N->getOperand(0).getOperand(0).isUndef() &&
+ isNullConstant(N->getOperand(0).getOperand(2))) {
+ SDValue In = N->getOperand(0).getOperand(1);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, N->getOperand(0).getOperand(2));
+ }
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(N->getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ if (!LN->isVolatile()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ VT.getVectorElementType(),
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ MachineMemOperand::MOLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return VZLoad;
+ }
+ }
+
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
@@ -32436,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Handle special case opcodes.
switch (Opc) {
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ // Multiply by zero.
+ KnownZero = LHSZero | RHSZero;
+ break;
+ }
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
@@ -32443,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
+
+ // If the shift amount is only ever reused as the amount operand of SSE
+ // shift nodes, then we know that only the bottom 64 bits are ever used.
+ bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
+ unsigned UseOpc = Use->getOpcode();
+ return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
+ UseOpc == X86ISD::VSRA) &&
+ Use->getOperand(0) != Amt;
+ });
+
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
- Depth + 1))
+ Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
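The PMULDQ/PMULUDQ case forwards the demanded elements to both factors and records a lane as known zero whenever either factor's lane is known zero, i.e. the known-zero lane sets are simply united. A one-line sketch on plain bitmasks standing in for the APInt element masks:

#include <cassert>
#include <cstdint>

// For an element-wise multiply, a result lane is known zero whenever either
// factor's lane is known zero, so the known-zero lane sets are OR-ed together.
static uint64_t knownZeroLanesOfMul(uint64_t LHSZeroLanes, uint64_t RHSZeroLanes) {
  return LHSZeroLanes | RHSZeroLanes;
}

int main() {
  // Lanes 0 and 2 of the LHS and lane 3 of the RHS are zero, so the product
  // has lanes 0, 2 and 3 known zero.
  assert(knownZeroLanesOfMul(0b0101, 0b1000) == 0b1101);
  return 0;
}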
@@ -32487,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ APInt DemandedLHS, DemandedRHS;
+ getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = SrcZero.zextOrTrunc(NumElts);
+ KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+ break;
+ }
+ case X86ISD::BLENDV: {
+ APInt SelUndef, SelZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
+ SelZero, TLO, Depth + 1))
+ return true;
+
+ // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+
+ KnownZero = LHSZero & RHSZero;
+ KnownUndef = LHSUndef & RHSUndef;
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32494,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
- if(Src.getValueType() != VT)
+ if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
@@ -32506,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
- case X86ISD::PSHUFB: {
- // TODO - simplify other variable shuffle masks.
+ case X86ISD::SUBV_BROADCAST: {
+ // Reduce size of broadcast if we don't need the upper half.
+ unsigned HalfElts = NumElts / 2;
+ if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ SDValue Half = Src;
+ if (SrcVT.getVectorNumElements() != HalfElts) {
+ MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
+ Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
+ }
+
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
+ TLO.DAG, SDLoc(Op),
+ Half.getValueSizeInBits()));
+ }
+ break;
+ }
+ case X86ISD::VPERMV: {
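+    // VPERMV takes its variable shuffle mask in operand 0.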
+ SDValue Mask = Op.getOperand(0);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMILPV: {
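+    // These variable shuffles take their mask in operand 1.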
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
@@ -32515,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMIL2: {
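+    // VPPERM and VPERMIL2 take their variable shuffle mask in operand 2.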
+ SDValue Mask = Op.getOperand(2);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ }
+
+ // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
+ // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
+ // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ DemandedElts.lshr(NumElts / 2) == 0) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned ExtSizeInBits = SizeInBits / 2;
+
+ // See if 512-bit ops only use the bottom 128-bits.
+ if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
+ ExtSizeInBits = SizeInBits / 4;
+
+ switch (Opc) {
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ // Byte shifts by immediate.
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ // Shift by uniform.
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ // Shift by immediate.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ case X86ISD::VPERMI: {
+ // Simplify PERMPD/PERMQ to extract_subvector.
+ // TODO: This should be done in shuffle combining.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64) {
+ SmallVector<int, 4> Mask;
+ DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
+ if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
+ SDLoc DL(Op);
+ SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ // Target Shuffles.
+ case X86ISD::PSHUFB:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // Saturated Packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // Horizontal Ops.
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ SDLoc DL(Op);
+ MVT ExtVT = VT.getSimpleVT();
+ ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
+ ExtSizeInBits / ExtVT.getScalarSizeInBits());
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue Ext1 =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
}
// Simplify target shuffles.
@@ -32606,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
break;
}
@@ -32727,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumVecElts = VecVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ unsigned VecBitWidth = VecVT.getScalarSizeInBits();
+
+ // If we demand no bits from the vector then we must have demanded
+      // bits from the implicit zext - simplify to zero.
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
+ if (DemandedVecBits == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ Known = KnownVec.zext(BitWidth, true);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ KnownBits KnownVec;
+ APInt DemandedVecElts(OriginalDemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
+ Known.One = KnownVec.One & KnownScl.One;
+ Known.Zero = KnownVec.Zero & KnownScl.Zero;
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PACKSS:
+    // PACKSS saturates to MIN/MAX integer values. So if we only want the
+    // sign bit then we can just ask for the source operand's sign bit.
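+    // e.g. for PACKSSWB (v8i16 -> v16i8), signed saturation never flips the
+    // sign, so each i8 result has the same sign bit as its i16 source element.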
+ // TODO - add known bits handling.
+ if (OriginalDemandedBits.isSignMask()) {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
+
+ KnownBits KnownLHS, KnownRHS;
+ APInt SignMask = APInt::getSignMask(BitWidth * 2);
+ if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
+ KnownLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
+ KnownRHS, TLO, Depth + 1))
+ return true;
+ }
+ // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32868,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Helper to peek through bitops/setcc to determine size of source vector.
+// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return Src.getOperand(0).getValueSizeInBits() == Size;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ }
+ return false;
+}
+
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
-static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
+ const SDLoc &DL,
const X86Subtarget &Subtarget) {
- EVT VT = BitCast.getValueType();
- SDValue N0 = BitCast.getOperand(0);
- EVT VecVT = N0->getValueType(0);
-
- if (!VT.isScalarInteger() || !VecVT.isSimple())
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
- (N0.getOperand(0).getValueType() == MVT::v16i8 ||
- N0.getOperand(0).getValueType() == MVT::v32i8 ||
- N0.getOperand(0).getValueType() == MVT::v64i8);
+ bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
@@ -32908,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
- switch (VecVT.getSimpleVT().SimpleTy) {
+ switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
@@ -32918,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- N0->getOperand(0).getValueType().is256BitVector()) {
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
SExtVT = MVT::v4i64;
- }
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
@@ -32930,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0).getValueType().is256BitVector() ||
- N0->getOperand(0).getValueType().is512BitVector())) {
+    // TODO: use checkBitcastSrcVectorSize
+ if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ (Src.getOperand(0).getValueType().is256BitVector() ||
+ Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
@@ -32956,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return SDValue();
};
- SDLoc DL(BitCast);
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0);
+ SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
SDValue Lo, Hi;
@@ -32977,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
- return DAG.getZExtOrTrunc(V, DL, VT);
+
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, IntVT);
+ return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
@@ -33054,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDLoc DL(N);
- unsigned NumElts = N.getNumOperands();
-
- auto *BV = cast<BuildVectorSDNode>(N);
+ SDLoc DL(BV);
+ unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
@@ -33107,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
- Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
@@ -33141,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
- if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ SDLoc dl(N);
+ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
@@ -33159,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -33213,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
- return createMMXBuildVector(N0, DAG, Subtarget);
+ return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
@@ -33297,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Given a select, detect the following pattern:
-// 1: %2 = zext <N x i8> %0 to <N x i32>
-// 2: %3 = zext <N x i8> %1 to <N x i32>
-// 3: %4 = sub nsw <N x i32> %2, %3
-// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// Given an ABS node, detect the following pattern:
+// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
- SDValue &Op1) {
- // Check the condition of the select instruction is greater-than.
- SDValue SetCC = Select->getOperand(0);
- if (SetCC.getOpcode() != ISD::SETCC)
- return false;
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT && CC != ISD::SETLT)
- return false;
-
- SDValue SelectOp1 = Select->getOperand(1);
- SDValue SelectOp2 = Select->getOperand(2);
-
- // The following instructions assume SelectOp1 is the subtraction operand
- // and SelectOp2 is the negation operand.
- // In the case of SETLT this is the other way around.
- if (CC == ISD::SETLT)
- std::swap(SelectOp1, SelectOp2);
-
- // The second operand of the select should be the negation of the first
- // operand, which is implemented as 0 - SelectOp1.
- if (!(SelectOp2.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
- SelectOp2.getOperand(1) == SelectOp1))
- return false;
-
- // The first operand of SetCC is the first operand of the select, which is the
- // difference between the two input vectors.
- if (SetCC.getOperand(0) != SelectOp1)
- return false;
-
- // In SetLT case, The second operand of the comparison can be either 1 or 0.
- APInt SplatVal;
- if ((CC == ISD::SETLT) &&
- !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isOneValue()) ||
- (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
+ SDValue AbsOp1 = Abs->getOperand(0);
+ if (AbsOp1.getOpcode() != ISD::SUB)
return false;
- // In SetGT case, The second operand of the comparison can be either -1 or 0.
- if ((CC == ISD::SETGT) &&
- !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
- ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
- return false;
-
- // The first operand of the select is the difference between the two input
- // vectors.
- if (SelectOp1.getOpcode() != ISD::SUB)
- return false;
-
- Op0 = SelectOp1.getOperand(0);
- Op1 = SelectOp1.getOperand(1);
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
@@ -33476,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
-// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Bail without SSE2 or with AVX512VL (which uses predicate registers).
- if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ // Bail without SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
- ExtractVT != MVT::i8)
+ ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
- // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+ if (!Match && ExtractVT == MVT::i1)
+ Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
@@ -33501,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
- // We require AVX2 for PMOVMSKB for v16i16/v32i8;
- unsigned MatchSizeInBits = Match.getValueSizeInBits();
- if (!(MatchSizeInBits == 128 ||
- (MatchSizeInBits == 256 &&
- ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
- return SDValue();
+ SDValue Movmsk;
+ SDLoc DL(Extract);
+ EVT MatchVT = Match.getValueType();
+ unsigned NumElts = MatchVT.getVectorNumElements();
- // Don't bother performing this for 2-element vectors.
- if (Match.getValueType().getVectorNumElements() <= 2)
- return SDValue();
+ if (ExtractVT == MVT::i1) {
+ // Special case for (pre-legalization) vXi1 reductions.
+ if (NumElts > 32)
+ return SDValue();
+ if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ // If this is a legal AVX512 predicate type then we can just bitcast.
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = DAG.getBitcast(MovmskVT, Match);
+ } else {
+ // Use combineBitcastvxi1 to create the MOVMSK.
+ if (NumElts == 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ NumElts = 16;
+ }
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
+ }
+ if (!Movmsk)
+ return SDValue();
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
+ } else {
+ // Bail with AVX512VL (which uses predicate registers).
+ if (Subtarget.hasVLX())
+ return SDValue();
- // Check that we are extracting a reduction of all sign bits.
- if (DAG.ComputeNumSignBits(Match) != BitWidth)
- return SDValue();
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 && Subtarget.hasAVX())))
+ return SDValue();
- // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
- MVT MaskVT;
- if (64 == BitWidth || 32 == BitWidth)
- MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
- MatchSizeInBits / BitWidth);
- else
- MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+ // Make sure this isn't a vector of 1 element. The perf win from using
+    // MOVMSK diminishes with fewer elements in the reduction, but it is
+ // generally better to get the comparison over to the GPRs as soon as
+ // possible to reduce the number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ MatchSizeInBits = Match.getValueSizeInBits();
+ }
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskSrcVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
+ Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
+ NumElts = MaskSrcVT.getVectorNumElements();
+ }
+ assert(NumElts <= 32 && "Not expecting more than 32 elements");
- APInt CompareBits;
+ if (BinOp == ISD::XOR) {
+ // parity -> (AND (CTPOP(MOVMSK X)), 1)
+ SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
+ }
+
+ SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CompareBits = APInt::getNullValue(32);
+ CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
- CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
- // Perform the select as i32/i64 and then truncate to avoid partial register
- // stalls.
- unsigned ResWidth = std::max(BitWidth, 32u);
- EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
- SDLoc DL(Extract);
- SDValue Zero = DAG.getConstant(0, DL, ResVT);
- SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
- SDValue Res = DAG.getBitcast(MaskVT, Match);
- Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
- Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
- Ones, Zero, CondCode);
- return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ // The setcc produces an i8 of 0/1, so extend that to the result width and
+ // negate to get the final 0/-1 mask value.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
+ SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
+ SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
+ return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -33592,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
  // If there was a match, we want Root to be an ABS node that is the root of
  // an abs-diff pattern.
- if (!Root || (Root.getOpcode() != ISD::VSELECT))
+ if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
  // Check whether we have an abs-diff pattern feeding into the ABS.
@@ -33651,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ SDValue SrcBC = peekThroughBitcasts(Src);
+
  // Handle extract(broadcast(scalar_value)); it doesn't matter what the
  // index is.
- if (X86ISD::VBROADCAST == Src.getOpcode() &&
- Src.getOperand(0).getValueType() == VT)
- return Src.getOperand(0);
+ if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
+ SDValue SrcOp = SrcBC.getOperand(0);
+ if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, SrcOp);
+ }
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -33704,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
@@ -33714,6 +35509,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
@@ -33723,6 +35519,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
@@ -33731,6 +35528,155 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+ SDValue Vec = ExtElt->getOperand(0);
+ SDValue Index = ExtElt->getOperand(1);
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = Vec.getValueType();
+
+ // TODO: If this is a unary/expensive/expand op, allow extraction from a
+ // non-zero element because the shuffle+scalar op will be cheaper?
+ if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+ return SDValue();
+
+ // Vector FP compares don't fit the pattern of FP math ops (propagate, not
+  // extract, the condition code), so deal with those as a special case.
+ if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
+ EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
+ if (OpVT != MVT::f32 && OpVT != MVT::f64)
+ return SDValue();
+
+ // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(1), Index);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
+ }
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Vector FP selects don't fit the pattern of FP math ops (because the
+ // condition has a different type and we have to change the opcode), so deal
+ // with those here.
+ // FIXME: This is restricted to pre type legalization by ensuring the setcc
+ // has i1 elements. If we loosen this we need to convert vector bool to a
+ // scalar bool.
+ if (Vec.getOpcode() == ISD::VSELECT &&
+ Vec.getOperand(0).getOpcode() == ISD::SETCC &&
+ Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
+ Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
+ // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Vec.getOperand(0).getValueType().getScalarType(),
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(1), Index);
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(2), Index);
+ return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
+ }
+
+ // TODO: This switch could include FNEG and the x86-specific FP logic ops
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // missed load folding and fma+fneg combining.
+ switch (Vec.getOpcode()) {
+ case ISD::FMA: // Begin 3 operands
+ case ISD::FMAD:
+ case ISD::FADD: // Begin 2 operands
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCOPYSIGN:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case ISD::FABS: // Begin 1 operand
+ case ISD::FSQRT:
+ case ISD::FRINT:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case X86ISD::FRCP:
+ case X86ISD::FRSQRT: {
+ // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
+ SDLoc DL(ExtElt);
+ SmallVector<SDValue, 4> ExtOps;
+ for (SDValue Op : Vec->ops())
+ ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+ return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+ }
+ default:
+ return SDValue();
+ }
+ llvm_unreachable("All opcodes should return within switch");
+}
+
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ if (!isNullConstant(Index))
+ return SDValue();
+
+ // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ if (!Rdx)
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = ExtElt->getOperand(0).getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+ SDLoc DL(ExtElt);
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+ assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
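+  // Repeatedly hadd the 128-bit vector with itself; each step halves the
+  // number of distinct partial sums, so after log2(NumElts) steps element 0
+  // holds the reduction of all lanes.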
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -33741,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
+ SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
+ SDLoc dl(InputVector);
+ bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+
+ if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+
+ // Integer Constant Folding.
+ if (CIdx && VT.isInteger()) {
+ APInt UndefVecElts;
+ SmallVector<APInt, 16> EltBits;
+ unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
+ if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
+ EltBits, true, false)) {
+ uint64_t Idx = CIdx->getZExtValue();
+ if (UndefVecElts[Idx])
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+ return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
+ dl, VT);
+ }
+ }
+
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
- if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ if (IsPextr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(
+ SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ return SDValue(N, 0);
return SDValue();
+ }
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
- SDValue InputVector = N->getOperand(0);
- SDValue EltIdx = N->getOperand(1);
-
- EVT SrcVT = InputVector.getValueType();
- EVT VT = N->getValueType(0);
- SDLoc dl(InputVector);
-
  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -33778,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
- isa<ConstantSDNode>(EltIdx) &&
- isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt = N->getConstantOperandVal(1);
- auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
- const APInt &InputValue = InputC->getAPIntValue();
- uint64_t Res = InputValue[ExtractedElt];
- return DAG.getConstant(Res, dl, MVT::i1);
- }
-
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
  // pre-legalization.
@@ -33802,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
+ if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = scalarizeExtEltFP(N, DAG))
+ return V;
+
+  // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
+ // and then testing the relevant element.
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ SmallVector<SDNode *, 16> BoolExtracts;
+ auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getValueType(0) == MVT::i1) {
+ BoolExtracts.push_back(Use);
+ return true;
+ }
+ return false;
+ };
+ if (all_of(InputVector->uses(), IsBoolExtract) &&
+ BoolExtracts.size() > 1) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
+ if (SDValue BC =
+ combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
+ for (SDNode *Use : BoolExtracts) {
+ // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
+ unsigned MaskIdx = Use->getConstantOperandVal(1);
+ APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
+ SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
+ Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
+ DCI.CombineTo(Use, Res);
+ }
+ return SDValue(N, 0);
+ }
+ }
+ }
+
return SDValue();
}
@@ -33825,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
- if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
- CondVT.getVectorElementType() == MVT::i1) {
+ // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
+ // TODO: Can we assert that both operands are not zeros (because that should
+ // get simplified at node creation time)?
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+ if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
@@ -33844,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
// Try to invert the condition if true value is not all 1s and false value is
- // not all 0s.
- if (!TValIsAllOnes && !FValIsAllZeros &&
+ // not all 0s. Only do this if the condition has one use.
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
@@ -33907,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
+/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+ return SDValue();
+
+ // TODO: Split 512-bit vectors too?
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ // TODO: Split as long as any 2 of the 3 operands are concatenated?
+ SDValue Cond = N->getOperand(0);
+ SDValue TVal = N->getOperand(1);
+ SDValue FVal = N->getOperand(2);
+ SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+ if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+ !collectConcatOps(TVal.getNode(), CatOpsT) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF))
+ return SDValue();
+
+ auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+ makeBlend, /*CheckBWI*/ false);
+}
+
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
@@ -33973,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
-/// This function will also call SimplfiyDemandedBits on already created
+/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34268,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
+ // AVX512 - Extend select with zero to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), zero) -->
+ // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // TODO - support non target shuffles as well.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ auto SelectableOp = [&TLI](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isTargetShuffle(Op.getOperand(0).getOpcode()) &&
+ isNullConstant(Op.getOperand(1)) &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.hasOneUse() && Op.getOperand(0).hasOneUse();
+ };
+
+ bool SelectableLHS = SelectableOp(LHS);
+ bool SelectableRHS = SelectableOp(RHS);
+ bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
+ EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
+ : RHS.getOperand(0).getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
+ LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
+ DAG.getUNDEF(SrcCondVT), Cond,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
+ return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
+ }
+ }
+
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
@@ -34338,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
- // TODO: Handle build_vectors with undef elements.
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
+ return (!Op && !Cond) ||
+ (Op && Cond &&
+ Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
- OpRHS = DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), OpRHS);
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
+ /*AllowUndefs*/ true)) {
+ OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
@@ -34432,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
+ if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
+ return V;
+
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
@@ -34715,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
-static SDValue combineCarryThroughADD(SDValue EFLAGS) {
+static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
@@ -34728,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) {
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
- if (Carry.getConstantOperandVal(0) == X86::COND_B)
- return Carry.getOperand(1);
+ // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
+ uint64_t CarryCC = Carry.getConstantOperandVal(0);
+ SDValue CarryOp1 = Carry.getOperand(1);
+ if (CarryCC == X86::COND_B)
+ return CarryOp1;
+ if (CarryCC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp
+ // instruction cannot take an immediate as its first operand.
+ //
+ if (CarryOp1.getOpcode() == X86ISD::SUB &&
+ CarryOp1.getNode()->hasOneUse() &&
+ CarryOp1.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
+ SDValue SubCommute =
+ DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
+ CarryOp1.getOperand(1), CarryOp1.getOperand(0));
+ return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
+ }
+ }
+ // If this is a check of the z flag of an add with 1, switch to the
+ // C flag.
+ if (CarryCC == X86::COND_E &&
+ CarryOp1.getOpcode() == X86ISD::ADD &&
+ isOneConstant(CarryOp1.getOperand(1)))
+ return CarryOp1;
}
}
}
@@ -34744,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
- if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
@@ -34763,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+ // cmov X, X, ?, ? --> X
+ if (TrueOp == FalseOp)
+ return TrueOp;
+
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -35044,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
- bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
@@ -35283,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
@@ -35352,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
- if (DAG.getMachineFunction().getFunction().optForMinSize())
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
@@ -35489,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -35638,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (N->getOpcode() == ISD::SHL)
- if (SDValue V = combineShiftLeft(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRA)
- if (SDValue V = combineShiftRightArithmetic(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
- return V;
-
- return SDValue();
-}
-
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -35677,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
- if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
- (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
@@ -35750,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35766,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
+ // Detect constant shift amounts.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
+ unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
+ return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
+ EltBits[0].getZExtValue(), DAG);
+ }
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -35829,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -35864,18 +37958,20 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- assert(
- ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW &&
- N->getValueType(0) == MVT::v8i16)) &&
- "Unexpected vector insertion");
+ EVT VT = N->getValueType(0);
+ assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35894,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDValue CMP0 = N0->getOperand(1);
- SDValue CMP1 = N1->getOperand(1);
+ SDValue CMP0 = N0.getOperand(1);
+ SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
@@ -35987,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
@@ -35996,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue X, Y;
- SDValue N0 = peekThroughBitcasts(N->getOperand(0));
- SDValue N1 = peekThroughBitcasts(N->getOperand(1));
- if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
- X = N0.getOperand(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (SDValue Not = IsNOT(N0, DAG)) {
+ X = Not;
Y = N1;
- } else if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
- X = N1.getOperand(0);
+ } else if (SDValue Not = IsNOT(N1, DAG)) {
+ X = Not;
Y = N0;
} else
return SDValue();
@@ -36046,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The type of the truncated inputs.
- if (N0->getOperand(0).getValueType() != VT)
+ if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
@@ -36062,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0->getOperand(0);
+ N0 = N0.getOperand(0);
if (RHSTrunc)
- N1 = N1->getOperand(0);
+ N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
@@ -36088,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- unsigned FPOpcode = ISD::DELETED_NODE;
- if (N->getOpcode() == ISD::AND)
- FPOpcode = X86ISD::FAND;
- else if (N->getOpcode() == ISD::OR)
- FPOpcode = X86ISD::FOR;
- else if (N->getOpcode() == ISD::XOR)
- FPOpcode = X86ISD::FXOR;
-
- assert(FPOpcode != ISD::DELETED_NODE &&
- "Unexpected input node for FP logic conversion");
-
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- ((Subtarget.hasSSE1() && VT == MVT::i32) ||
- (Subtarget.hasSSE2() && VT == MVT::i64))) {
- SDValue N00 = N0.getOperand(0);
- SDValue N10 = N1.getOperand(0);
- EVT N00Type = N00.getValueType();
- EVT N10Type = N10.getValueType();
- if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
- SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
- return DAG.getBitcast(VT, FPLogic);
- }
+
+ if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+
+ // Ensure that both types are the same and are legal scalar fp types.
+ if (N00Type != N10Type ||
+ !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ return SDValue();
+
+ unsigned FPOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
- return SDValue();
+
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
@@ -36371,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
+ // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getAllOnesValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ }
+ }
+ }
+
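
// [Editor's note: illustrative sketch, not part of this patch; assumes SSE2
// intrinsics and a hypothetical helper name.] Scalar model of the all-of
// reduction matched above: AND-ing every lane of a vector compare is the
// same as checking that MOVMSK sets all of its low bits.
#include <emmintrin.h>
static inline bool allBytesEqual(__m128i A, __m128i B) {
  __m128i Eq = _mm_cmpeq_epi8(A, B);      // 0xFF in each byte lane that matches
  return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 sign bits set => all lanes true
}
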
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -36392,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36440,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
+static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N->getOperand(1));
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // On XOP we'll lower to PCMOV so accept one use, otherwise only
+ // do this if either mask has multiple uses already.
+ if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
+ !N1.getOperand(1).hasOneUse()))
+ return SDValue();
+
+ // Attempt to extract constant byte masks.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
+ false, false))
+ return SDValue();
+ if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
+ false, false))
+ return SDValue();
+
+ for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
+ // TODO - add UNDEF elts support.
+ if (UndefElts0[i] || UndefElts1[i])
+ return SDValue();
+ if (EltBits0[i] != ~EltBits1[i])
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ SDValue Y =
+ DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
+ DAG.getBitcast(VT, N1.getOperand(0)));
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
+}
+
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
@@ -36472,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of vselect:
+// (vselect M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
+ return SDValue();
+
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
+  // but if the pattern matched was
+  // (vselect M, X, (sub 0, X)), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
+}
+
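
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] The conditional-negate identity used above, checked
// on scalars: with a mask M that is all-zeros or all-ones,
// (M ? -X : X) == (X ^ M) - M.
#include <cstdint>
static inline int32_t conditionalNegate(int32_t X, int32_t M) {
  // M == 0 gives X - 0 == X; M == -1 gives ~X + 1 == -X.
  return (int32_t)(((uint32_t)X ^ (uint32_t)M) - (uint32_t)M);
}
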
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -36507,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
- // Try to match:
- // (or (and (M, (sub 0, X)), (pandn M, X)))
- // which is a special case of vselect:
- // (vselect M, (sub 0, X), X)
- // Per:
- // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
- // We know that, if fNegate is 0 or 1:
- // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
- //
- // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
- // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
- // ( M ? -X : X) == ((X ^ M ) + (M & 1))
- // This lets us transform our vselect to:
- // (add (xor X, M), (and M, 1))
- // And further to:
- // (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
- DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
-
- if (V) {
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
- }
- }
+ // Attempt to combine to conditional negate: (sub (xor X, M), M)
+ if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
+ DAG, Subtarget))
+ return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
@@ -36665,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
- EVT VT = OR->getValueType(0);
- SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
@@ -36702,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36718,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -36747,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
@@ -36765,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
- unsigned Opc = X86ISD::SHLD;
+ unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB ||
- ShAmt0.getOpcode() == ISD::XOR) {
- Opc = X86ISD::SHRD;
+ if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
+ Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C )
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
+ if (Opc == ISD::FSHR)
+ std::swap(Op0, Op1);
+ return DAG.getNode(Opc, DL, VT, Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ };
+
+ // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+ if (ShAmt1Op1.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
+ ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
+ ShMsk1 = ShAmt1Op1;
+ ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+ }
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return DAG.getNode(Opc, DL, VT,
- N0.getOperand(0), N1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL,
- MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+ unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
@@ -36812,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandVal(1) == 1) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ Op1.getConstantOperandAPInt(1) == 1) {
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
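
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] Scalar identity behind rewriting
// OR(SHL(X, C), SRL(Y, Bits - C)) as a funnel shift, assuming a 32-bit type
// and 0 < C < 32 so both shift amounts stay in range:
#include <cstdint>
static inline uint32_t fshl32(uint32_t X, uint32_t Y, unsigned C) {
  return (X << C) | (Y >> (32 - C)); // matches ISD::FSHL for 0 < C < 32
}
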
@@ -36862,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
- Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
@@ -36915,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The shift should be smearing the sign bit across each vector element.
- auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
- if (!ShiftBV)
- return SDValue();
-
- EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
- auto *ShiftAmt = ShiftBV->getConstantSplatNode();
- if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ auto *ShiftAmt =
+ isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
+ if (!ShiftAmt ||
+ ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
@@ -37203,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
AVGBuilder);
}
- if (Operands[0].getOpcode() == ISD::ADD)
+ // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
+  // Match the or case only if it's 'add-like' - can be replaced by an add.
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
+ if (ISD::ADD == V.getOpcode()) {
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ }
+ if (ISD::ZERO_EXTEND != V.getOpcode())
+ return false;
+ V = V.getOperand(0);
+ if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
+ !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
+ return false;
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ };
+
+ SDValue Op0, Op1;
+ if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
- else if (Operands[1].getOpcode() != ISD::ADD)
+ else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
- Operands[2] = Operands[1].getOperand(0);
- Operands[1] = Operands[1].getOperand(1);
+ Operands[2] = Op0;
+ Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
- // constant vector with ones, and the other two are promoted from i8/i16.
+ // constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
@@ -37219,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
- if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
- Operands[j].getOperand(0).getValueType() != VT)
- return SDValue();
+ if (Operands[j].getValueType() != VT) {
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+ Operands[j] = Operands[j].getOperand(0);
+ }
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Operands[0].getOperand(0),
- Operands[1].getOperand(0) }, AVGBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
+ AVGBuilder);
}
return SDValue();
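
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] Two scalar facts the pattern above relies on:
// PAVGB/PAVGW compute the rounded average (A + B + 1) >> 1, and an OR of
// values with no common bits set already behaves like an ADD, since no
// carries can occur.
#include <cstdint>
static inline uint8_t avgu8(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1); // what X86ISD::AVG produces
}
static_assert((0xF0u | 0x0Cu) == (0xF0u + 0x0Cu), "disjoint bits: OR == ADD");
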
@@ -37246,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast))) {
+ *Ld->getMemOperand(), &Fast) &&
+ !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Ptr = Ld->getBasePtr();
-
+ unsigned HalfAlign = 16;
+ SDValue Ptr1 = Ld->getBasePtr();
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- NumElems/2);
+ NumElems / 2);
SDValue Load1 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
-
- Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
- SDValue Load2 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
+ Ld->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- Load1.getValue(1),
- Load2.getValue(1));
+ Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
+ // Bool vector load - attempt to cast to an integer, as we have good
+ // (vXiY *ext(vXi1 bitcast(iX))) handling.
+ if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
+ RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
+ unsigned NumElts = RegVT.getVectorNumElements();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ if (TLI.isTypeLegal(IntVT)) {
+ SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Alignment,
+ Ld->getMemOperand()->getFlags());
+ SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
+ return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
+ }
+ }
+
return SDValue();
}
@@ -37404,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
if (ML->getPassThru().isUndef())
return SDValue();
+ if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
+ return SDValue();
+
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
@@ -37434,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return Blend;
}
- if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ if (Mld->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// Resolve extending loads.
@@ -37504,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WidePassThru,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
- return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+ SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+ return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
@@ -37543,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (!Mst->isTruncatingStore()) {
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
@@ -37551,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
@@ -37561,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// pattern above, but that pattern will be different. It will either need to
// match setcc more generally or match PCMPGTM later (in tablegen?).
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ }
+
return SDValue();
}
// Resolve truncating stores.
unsigned NumElems = VT.getVectorNumElements();
- EVT StVT = Mst->getMemoryVT();
- SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
@@ -37644,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
+ unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -37699,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
- unsigned Alignment = St->getAlignment();
-
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
@@ -37724,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
- unsigned AddressSpace = St->getAddressSpace();
- unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AddressSpace, Alignment, &Fast) &&
+ *St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
- SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
+ return splitVectorStore(St, DAG);
+ }
- SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+ // Split under-aligned vector non-temporal stores.
+ if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+ // vectors or the legalizer can scalarize it to use MOVNTI.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+ return splitVectorStore(St, DAG);
+ }
+
+ // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+ // to use MOVNTI.
+ if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+ MVT NTVT = Subtarget.hasSSE4A()
+ ? MVT::v2f64
+ : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+ return scalarizeVectorStore(St, NTVT, DAG);
+ }
+ }
- SDValue Ch0 =
- DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
- SDValue Ch1 =
- DAG.getStore(St->getChain(), dl, Value1, Ptr1,
- St->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ // Try to optimize v16i16->v16i8 truncating stores when BWI is not
+  // supported but AVX512F is, by extending to v16i32 and truncating.
+ if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
+ St->getValue().getOpcode() == ISD::TRUNCATE &&
+ St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
+ TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
+ !DCI.isBeforeLegalizeOps()) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v16i8, St->getMemOperand());
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
@@ -37763,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Val =
detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
TLI))
@@ -37867,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if ((VT.isVector() ||
+ if (((VT.isVector() && !VT.isFloatingPoint()) ||
(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
@@ -37890,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = (Subtarget.is64Bit() &&
- (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
+ MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getMemOperand());
@@ -37965,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool IsCommutative) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
@@ -37979,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
- // At least one of the operands should be a vector shuffle.
- if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
- RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
- return false;
-
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - can we make a general helper method that does all of this for us?
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool UseSubVector = false;
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getValueType().is256BitVector() &&
+ llvm::isNullConstant(Op.getOperand(1))) {
+ Op = Op.getOperand(0);
+ UseSubVector = true;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary)) {
+ if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
+ SrcOps.size() <= 2) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
+ SrcOps.size() == 1) {
+ N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
+ N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
+ ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ }
+ }
+ };
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
- unsigned NumElts = VT.getVectorNumElements();
SDValue A, B;
- SmallVector<int, 16> LMask(NumElts);
- if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!LHS.getOperand(0).isUndef())
- A = LHS.getOperand(0);
- if (!LHS.getOperand(1).isUndef())
- B = LHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
- llvm::copy(Mask, LMask.begin());
- } else {
- A = LHS;
- for (unsigned i = 0; i != NumElts; ++i)
- LMask[i] = i;
- }
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 16> RMask(NumElts);
- if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!RHS.getOperand(0).isUndef())
- C = RHS.getOperand(0);
- if (!RHS.getOperand(1).isUndef())
- D = RHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
- llvm::copy(Mask, RMask.begin());
- } else {
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
+ if (NumShuffles == 0)
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
- RMask[i] = i;
+ RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -38072,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
+ if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+ return false;
+
+ LHS = DAG.getBitcast(VT, LHS);
+ RHS = DAG.getBitcast(VT, RHS);
return true;
}
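
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] What a 128-bit horizontal FP add computes once
// isHorizontalBinOp has identified the sources: pairwise sums, with the low
// result pairs taken from LHS and the high pairs from RHS (HADDPS layout).
static inline void haddps128(const float LHS[4], const float RHS[4], float Out[4]) {
  Out[0] = LHS[0] + LHS[1];
  Out[1] = LHS[2] + LHS[3];
  Out[2] = RHS[0] + RHS[1];
  Out[3] = RHS[2] + RHS[3];
}
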
@@ -38088,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, IsFadd) &&
- shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
return SDValue();
@@ -38105,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
- unsigned Opcode = Src.getOpcode();
+ unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
@@ -38123,14 +40443,17 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return true;
// See if this is a single use constant which can be constant folded.
- SDValue BC = peekThroughOneUseBitcasts(Op);
- return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
+  // NOTE: We don't peek through bitcasts here because there is currently
+  // no support for constant folding truncate+bitcast+vector_of_constants. So
+  // we'll just end up with a truncate on both operands which will
+ // get turned back into (truncate (binop)) causing an infinite loop.
+ return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
- return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
@@ -38145,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// In most cases its only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
- switch (Opcode) {
+ switch (SrcOpcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38160,14 +40483,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.
- if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !TLI.isOperationLegal(Opcode, SrcVT))
+ if (SrcVT.getScalarType() == MVT::i64 &&
+ TLI.isOperationLegal(SrcOpcode, VT) &&
+ !TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38177,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38188,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+/// e.g. trunc <8 x i32> X to <8 x i16> -->
+/// MaskX = X & 0xffff (clear high bits to prevent saturation)
+/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getVectorElementType();
EVT OutVT = N->getValueType(0);
- EVT OutSVT = OutVT.getVectorElementType();
-
- // Split a long vector into vectors of legal type and mask to unset all bits
- // that won't appear in the result to prevent saturation.
- // TODO - we should be doing this at the maximum legal size but this is
- // causing regressions where we're concatenating back to max width just to
- // perform the AND and then extracting back again.....
- unsigned NumSubRegs = InVT.getSizeInBits() / 128;
- unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
- EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
- SmallVector<SDValue, 8> SubVecs(NumSubRegs);
-
- APInt Mask =
- APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
- SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
-
- for (unsigned i = 0; i < NumSubRegs; i++) {
- SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
- DAG.getIntPtrConstant(i * NumSubRegElts, DL));
- SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
- }
- In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
+ APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
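
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] Why the AND mask above makes PACKUS an exact
// truncation: PACKUS saturates each wide lane to the unsigned range of the
// narrow type, so once the high bits are cleared the value is already in
// range and the saturation cannot change it.
#include <cstdint>
static inline uint16_t packusLane32to16(uint32_t X) {
  uint32_t Masked = X & 0xFFFFu;                          // the AND from the combine
  return (uint16_t)(Masked > 0xFFFFu ? 0xFFFFu : Masked); // clamp is now a no-op
}
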
@@ -38580,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
+ unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
+
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
- auto VT = Op->getValueType(0);
+ EVT VT = Op->getValueType(0);
+  // Make sure the element size doesn't change.
+ if (VT.getScalarSizeInBits() != ScalarSize)
+ return SDValue();
+
if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
+ if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
+ SVOp->getMask());
return SDValue();
}
unsigned Opc = Op.getOpcode();
@@ -38601,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
- NegInsVal, Op.getOperand(2));
+ if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
+ NegInsVal, Op.getOperand(2));
return SDValue();
}
if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
return SDValue();
- SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
- if (!Op1.getValueType().isFloatingPoint())
- return SDValue();
-
- SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
// masks. For FSUB, we have to check if constant bits of Op0 are sign bit
@@ -38625,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all sign bit masks. Ignore the
// undef elements.
- if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize,
UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
@@ -38922,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: If an operand is already known to be a NaN or not a NaN, this
- // should be an optional swap and FMAX/FMIN.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
- if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
- (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
@@ -38941,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+  // If one of the operands is known non-NaN, use the native min/max instructions
+ // with the non-NaN input as second operand.
+ if (DAG.isKnownNeverNaN(Op1))
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+ if (DAG.isKnownNeverNaN(Op0))
+ return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+
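
// [Editor's note: illustrative sketch, not part of this patch; the helper
// name is hypothetical.] Why the known-non-NaN operand goes second above:
// MINSS/MAXSS behave like (A < B ? A : B), so when either input is NaN the
// comparison is false and the second operand is returned. Keeping the
// operand that cannot be NaN in that slot matches fminnum/fmaxnum's
// "return the non-NaN input" rule.
static inline float minssModel(float A, float B) {
  return A < B ? A : B; // comparison is false for NaN, so B is returned
}
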
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
- EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
- DAG.getDataLayout(), *DAG.getContext(), VT);
+ EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
@@ -38987,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
KnownZero, DCI))
return SDValue(N, 0);
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
return SDValue();
}
@@ -39005,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
- if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
- return DAG.getNode(ISD::AND, SDLoc(N), VT,
- N->getOperand(0).getOperand(0), N->getOperand(1));
- }
+ if (SDValue Not = IsNOT(N->getOperand(0), DAG))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
+ N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -39039,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
- if (ExtraVT != MVT::i16)
+ if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
- // Look through single use any_extends.
- if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
+ // Look through single use any_extends / truncs.
+ SDValue IntermediateBitwidthOp;
+ if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
+ N0.hasOneUse()) {
+ IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
+ }
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
@@ -39066,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // If we looked through an any_extend above, add one to the constants.
- if (N0.getValueType() != VT) {
- CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
- CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
+ // If we looked through an any_extend/trunc above, add one to the constants.
+ if (IntermediateBitwidthOp) {
+ unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
+ CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
+ CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
+
+ EVT CMovVT = DstVT;
+ // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
+ if (DstVT == MVT::i16) {
+ CMovVT = MVT::i32;
+ CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
- CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
- CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
+ SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
- return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
- N0.getOperand(2), N0.getOperand(3));
+ if (CMovVT != DstVT)
+ CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
+
+ return CMov;
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
@@ -39336,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
return SDValue();
unsigned Opcode = N->getOpcode();
+ // TODO - add ANY_EXTEND support.
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
@@ -39382,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
- EVT InVT = N.getValueType();
- EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
- Size / InVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
- DAG.getUNDEF(InVT));
+ EVT SrcVT = N.getValueType();
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ Size / SrcVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
+ DAG.getUNDEF(SrcVT));
Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
};
// If target-size is less than 128-bits, extend to a type that would extend
@@ -39410,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
(VT.is256BitVector() && Subtarget.hasAVX()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
- Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, ExOp);
}
@@ -39421,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
- unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
-
+ unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
@@ -39457,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
- if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
@@ -39473,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// that's the only integer compares with we have.
- ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
@@ -39629,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
if (!NegVal)
return SDValue();
+ // FIXME: Should we bitcast instead?
+ if (NegVal.getValueType() != VT)
+ return SDValue();
+
unsigned NewOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
@@ -39705,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
+ // TODO: Combine with any target/faux shuffle.
+ if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
+ VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
+ unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
+ if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
+ (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
+ return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+ }
+ }
+
return SDValue();
}
@@ -39734,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();
- // Bail out if we know that this is not really just an oversized integer.
- if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
- peekThroughBitcasts(Y).getValueType() == MVT::f128)
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+ X.getOpcode() == ISD::LOAD;
+ };
+ if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+ !IsOrXorXorCCZero)
return SDValue();
// TODO: Use PXOR + PTEST for SSE4.1 or later?
@@ -39873,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
- assert(VT== MVT::i32 && "Unexpected result type");
+ assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
- SDValue In = Src.getOperand(Idx);
- if (!In.isUndef() &&
- cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
+ if (!Src.getOperand(Idx).isUndef() &&
+ Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() &&
- SrcVT.isFloatingPoint() &&
- Src.getOperand(0).getValueType() ==
- EVT(SrcVT).changeVectorElementTypeToInteger())
- Src = Src.getOperand(0);
+ unsigned EltWidth = SrcVT.getScalarSizeInBits();
+ if (Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
+ return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
+
+ // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
+ // with scalar comparisons.
+ if (SDValue NotSrc = IsNOT(Src, DAG)) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ NotSrc = DAG.getBitcast(SrcVT, NotSrc);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
+ DAG.getConstant(NotMask, DL, VT));
+ }
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
- // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)).
- // Only do this when the setcc input and output types are the same and the
- // setcc and the 'and' node have a single use.
- // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't.
- APInt SplatVal;
- if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
- Src.getOperand(0).getValueType() == Src.getValueType() &&
- cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE &&
- ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
- Src.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Src.getOperand(0);
- if (And.hasOneUse() &&
- ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isPowerOf2()) {
- MVT VT = Src.getSimpleValueType();
- unsigned BitWidth = VT.getScalarSizeInBits();
- unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1;
- SDLoc DL(And);
- SDValue X = And.getOperand(0);
- // If the element type is i8, we need to bitcast to i16 to use a legal
- // shift. If we wait until lowering we end up with an extra and to bits
- // from crossing the 8-bit elements, but we don't care about that here.
- if (VT.getVectorElementType() == MVT::i8) {
- VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
- X = DAG.getBitcast(VT, X);
- }
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(ShAmt, DL, VT));
- SDValue Cast = DAG.getBitcast(SrcVT, Shl);
- return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast);
- }
- }
-
return SDValue();
}
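// [Illustrative example, not part of the patch] Assuming a v4i32 source, the
// new IsNOT fold above turns a vector NOT into scalar mask logic:
//   movmsk (not x) --> (movmsk x) ^ 0b1111
// which lets the inversion fold into later scalar flag/compare combines.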
@@ -40065,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (BuildVectorSDNode *BV =
- dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -40088,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
return SDValue();
}
+/// If we are converting a value to floating-point, try to replace scalar
+/// truncate of an extracted vector element with a bitcast. This tries to keep
+/// the sequence on XMM registers rather than moving between vector and GPRs.
+static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
+ // TODO: This is currently only used by combineSIntToFP, but it is generalized
+ // to allow being called by any similar cast opcode.
+ // TODO: Consider merging this into lowering: vectorizeExtractedCast().
+ SDValue Trunc = N->getOperand(0);
+ if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue ExtElt = Trunc.getOperand(0);
+ if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(ExtElt.getOperand(1)))
+ return SDValue();
+
+ EVT TruncVT = Trunc.getValueType();
+ EVT SrcVT = ExtElt.getValueType();
+ unsigned DestWidth = TruncVT.getSizeInBits();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ if (SrcWidth % DestWidth != 0)
+ return SDValue();
+
+ // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
+ EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
+ unsigned VecWidth = SrcVecVT.getSizeInBits();
+ unsigned NumElts = VecWidth / DestWidth;
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
+ SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
+ SDLoc DL(N);
+ SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
+ BitcastVec, ExtElt.getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
+}
+
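// [Illustrative sketch, not part of the patch] combineToFPTruncExtElt keeps
// int-to-fp conversions of a truncated lane 0 on the vector unit, e.g.:
//   sitofp (trunc i64->i32 (extractelement <2 x i64> %x, 0))
//     --> sitofp (extractelement (bitcast <2 x i64> %x to <4 x i32>), 0)
// The bitcast is safe for lane 0 because x86 is little-endian, so the low
// element of the narrower type holds the truncated bits.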
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
@@ -40181,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return FILDChain;
}
}
+
+ if (SDValue V = combineToFPTruncExtElt(N, DAG))
+ return V;
+
return SDValue();
}
@@ -40267,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
- EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
- unsigned ShAmt = Op.getConstantOperandVal(1);
- if (ShAmt < BitWidth) { // Avoid undefined shifts.
+ const APInt &ShAmt = Op.getConstantOperandAPInt(1);
+ if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
+ unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
- ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
- : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ ? APInt::getHighBitsSet(BitWidth, MaskBits)
+ : APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -40283,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
}
}
-
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
@@ -40337,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
return Op.getValue(1);
}
+static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
+ "Expected X86ISD::ADD or X86ISD::SUB");
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ MVT VT = LHS.getSimpleValueType();
+ unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+
+ // If we don't use the flag result, simplify back to a generic ADD/SUB.
+ if (!N->hasAnyUseOfValue(1)) {
+ SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
+ return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+ }
+
+ // Fold any similar generic ADD/SUB opcodes to reuse this node.
+ auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ SDValue Ops[] = {N0, N1};
+ SDVTList VTs = DAG.getVTList(N->getValueType(0));
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ SDValue Op(N, 0);
+ if (Negate)
+ Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+ DCI.CombineTo(GenericAddSub, Op);
+ }
+ };
+ MatchGeneric(LHS, RHS, false);
+ MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ return SDValue();
+}
+
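// [Illustrative note, not part of the patch] combineX86AddSub does two
// things: an X86ISD::ADD/SUB whose EFLAGS result is unused is relaxed back to
// a generic ISD::ADD/SUB, and any existing generic add/sub of the same
// operands is replaced with this node's value, negated when it matches the
// reversed subtraction since X - Y == -(Y - X).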
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
@@ -40346,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
Flags);
}
+ // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
+ // iff the flag result is dead.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+ !N->hasAnyUseOfValue(1))
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
+ Op0.getOperand(1), N->getOperand(2));
+
return SDValue();
}
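// [Illustrative note, not part of the patch] The new SBB fold above lets the
// borrowing subtract absorb a separate SUB when the SBB's own flag result is
// dead, computing X - Y - Carry in one instruction:
//   sbb (sub X, Y), 0, Carry --> sbb X, Y, Carry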
@@ -40372,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Res1, CarryOut);
}
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
@@ -40468,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
@@ -40575,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
auto BuildPMADDWD = [&](SDValue Mul) {
@@ -40631,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return SDValue();
// We know N is a reduction add, which means one of its operands is a phi.
- // To match SAD, we need the other operand to be a vector select.
- if (Op0.getOpcode() != ISD::VSELECT)
+ // To match SAD, we need the other operand to be a ABS.
+ if (Op0.getOpcode() != ISD::ABS)
std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::VSELECT)
+ if (Op0.getOpcode() != ISD::ABS)
return SDValue();
auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
@@ -40673,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
Op0 = BuildPSADBW(SadOp0, SadOp1);
// It's possible we have a sad on the other side too.
- if (Op1.getOpcode() == ISD::VSELECT &&
+ if (Op1.getOpcode() == ISD::ABS &&
detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
Op1 = BuildPSADBW(SadOp0, SadOp1);
}
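// [Illustrative note, not part of the patch] The loop SAD matcher now expects
// the absolute difference as an ISD::ABS node instead of the old
// vselect-of-subtracts idiom, i.e. a reduction add over roughly
//   abs (sub (zext a), (zext b))
// is what detectZextAbsDiff is given before PSADBW is formed.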
@@ -40815,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
PMADDBuilder);
}
-// Try to turn (add (umax X, C), -C) into (psubus X, C)
-static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // psubus is available in SSE2 for i8 and i16 vectors.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
- !isPowerOf2_32(VT.getVectorNumElements()) ||
- !(VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16))
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() != ISD::UMAX)
- return SDValue();
-
- // The add should have a constant that is the negative of the max.
- // TODO: Handle build_vectors with undef elements.
- auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
- return Max->getAPIntValue() == (-Op->getAPIntValue());
- };
- if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
- return SDValue();
-
- SDLoc DL(N);
- return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
- Op0.getOperand(1));
-}
-
// Attempt to turn this pattern into PMADDWD.
// (mul (add (zext (build_vector)), (zext (build_vector))),
// (add (zext (build_vector)), (zext (build_vector)))
@@ -40957,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
- EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i16 &&
+ EVT OpVT = Ops[0].getValueType();
+ assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
- assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- InVT.getVectorNumElements() / 2);
+ OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
@@ -40990,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -41003,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;
- if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
- return V;
-
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -41110,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
- APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
@@ -41124,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
@@ -41159,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Helper that combines an array of subvector ops as if they were the operands
+/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
+/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ if (llvm::all_of(Ops, [](SDValue Op) {
+ return ISD::isBuildVectorAllZeros(Op.getNode());
+ }))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ SDValue Op0 = Ops[0];
+
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
+ // Repeated subvectors.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ // If this broadcast/subv_broadcast is inserted into both halves, use a
+ // larger broadcast/subv_broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
+ return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+
+ // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
+ if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
+ (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
+ Op0.getOperand(0),
+ DAG.getIntPtrConstant(0, DL)));
+
+ // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Subtarget.hasAVX2() ||
+ (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ Op0.getOperand(0).getValueType() == VT.getScalarType())
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
+ }
+
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+
+ // Repeated opcode.
+ // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
+ // but it currently struggles with different vector widths.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOpcode() == Op0.getOpcode();
+ })) {
+ unsigned NumOps = Ops.size();
+ switch (Op0.getOpcode()) {
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFD:
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VPERMILPI:
+ // TODO - add support for vXf64/vXi64 shuffles.
+ if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
+ Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+ Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
+ Op0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case X86ISD::PACKUS:
+ if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ }
+ }
+
+ // If we're inserting all zeros into the upper half, change this to
+ // an insert into an all zeros vector. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
+ DAG.getIntPtrConstant(0, DL));
+
+ return SDValue();
+}
+
+static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Don't do anything for i1 vectors.
+ if (VT.getVectorElementType() == MVT::i1)
+ return SDValue();
+
+ if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
+ SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+ if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
+ DCI, Subtarget))
+ return R;
+ }
+
+ return SDValue();
+}
+
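// [Illustrative summary, not part of the patch] combineConcatVectorOps treats
// same-typed subvectors as operands of a CONCAT_VECTORS, whether they came
// from a real concat or from insert_subvector chains. Typical AVX folds:
//   concat (load p), (load p+16)            --> one wider load (if fast)
//   concat (vbroadcast x), (vbroadcast x)   --> wider vbroadcast x
//   concat (pshufd a, i), (pshufd b, i)     --> pshufd (concat a, b), i
//   concat x, zeroes                        --> insert x into a zero vector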
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -41173,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
- unsigned IdxVal = N->getConstantOperandVal(2);
+ uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
- if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
- // Inserting zeros into zeros is a nop.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (Vec.isUndef() && SubVec.isUndef())
+ return DAG.getUNDEF(OpVT);
+
+ // Inserting undefs/zeros into zeros/undefs is a zero vector.
+ if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
+ (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
- unsigned Idx2Val = SubVec.getConstantOperandVal(2);
+ uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
@@ -41197,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
- SubVec.getConstantOperandVal(1) == 0 &&
+ SubVec.getConstantOperandAPInt(1) == 0 &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
- if (Ins.getConstantOperandVal(2) == 0 &&
+ if (Ins.getConstantOperandAPInt(2) == 0 &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
-
- // If we're inserting a bitcast into zeros, rewrite the insert and move the
- // bitcast to the other side. This helps with detecting zero extending
- // during isel.
- // TODO: Is this useful for other indices than 0?
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
- MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
- unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
- MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
- SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
- DAG.getBitcast(NewVT, Vec),
- SubVec.getOperand(0), N->getOperand(2));
- return DAG.getBitcast(OpVT, Insert);
- }
}
// Stop here if this is an i1 vector.
@@ -41248,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
}
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
+ // Match concat_vector style patterns.
+ SmallVector<SDValue, 2> SubVectorOps;
+ if (collectConcatOps(N, SubVectorOps))
+ if (SDValue Fold =
+ combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
+ return Fold;
+
+ // If we are inserting into both halves of the vector, the starting vector
+ // should be undef. If it isn't, make it so. Only do this if the early insert
+ // has no other uses.
+ // TODO: Should this be a generic DAG combine?
+ // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- if (isNullConstant(Vec.getOperand(2))) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
- Subtarget, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and there's no other use of the lower
- // load, then splat the loaded value with a broadcast.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse())
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
-
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
-
- // If we're inserting all zeros into the upper half, change this to
- // an insert into an all zeros vector. We will match this to a move
- // with implicit upper bit zeroing during isel.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
- getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
- Vec.getOperand(2));
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
+ isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
+ Vec.hasOneUse()) {
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
+ Vec.getOperand(1), Vec.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
+ N->getOperand(2));
+ }
- // If we are inserting into both halves of the vector, the starting
- // vector should be undef. If it isn't, make it so. Only do this if the
- // the early insert has no other uses.
- // TODO: Should this be a generic DAG combine?
- if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
- SubVec2, Vec.getOperand(2));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
- N->getOperand(2));
+ // If this is a broadcast insert into an upper undef, use a larger broadcast.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
- }
- }
+ return SDValue();
+}
+
+/// If we are extracting a subvector of a vector select and the select condition
+/// is composed of concatenated vectors, try to narrow the select width. This
+/// is a common pattern for AVX1 integer code because 256-bit selects may be
+/// legal, but there is almost no integer math/logic available for 256-bit.
+/// This function should only be called with legal types (otherwise, the calls
+/// to get simple value types will assert).
+static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
+ SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
+ SmallVector<SDValue, 4> CatOps;
+ if (Sel.getOpcode() != ISD::VSELECT ||
+ !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+ return SDValue();
+
+ // Note: We assume simple value types because this should only be called with
+ // legal operations/types.
+ // TODO: This can be extended to handle extraction to 256-bits.
+ MVT VT = Ext->getSimpleValueType(0);
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
+ if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
+ return SDValue();
+
+ MVT WideVT = Ext->getOperand(0).getSimpleValueType();
+ MVT SelVT = Sel.getSimpleValueType();
+ assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
+ "Unexpected vector type with legal operations");
+
+ unsigned SelElts = SelVT.getVectorNumElements();
+ unsigned CastedElts = WideVT.getVectorNumElements();
+ unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ if (SelElts % CastedElts == 0) {
+ // The select has the same or more (narrower) elements than the extract
+ // operand. The extraction index gets scaled by that factor.
+ ExtIdx *= (SelElts / CastedElts);
+ } else if (CastedElts % SelElts == 0) {
+ // The select has less (wider) elements than the extract operand. Make sure
+ // that the extraction index can be divided evenly.
+ unsigned IndexDivisor = CastedElts / SelElts;
+ if (ExtIdx % IndexDivisor != 0)
+ return SDValue();
+ ExtIdx /= IndexDivisor;
+ } else {
+ llvm_unreachable("Element counts of simple vector types are not divisible?");
}
- return SDValue();
+ unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
+ unsigned NarrowElts = SelElts / NarrowingFactor;
+ MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
+ SDLoc DL(Ext);
+ SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
+ SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
+ SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
+ SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
+ return DAG.getBitcast(VT, NarrowSel);
}
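// [Illustrative example, not part of the patch] For AVX1 integer code,
// extracting a 128-bit half of a wide vselect whose condition is itself a
// concatenation can be narrowed, roughly:
//   extract_subvector (vselect (concat C0, C1), T, F), 4
//     --> vselect C1, (extract_subvector T, 4), (extract_subvector F, 4)
// so the 256-bit integer select never has to be legalized as such.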
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
@@ -41334,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
- EVT VT = N->getValueType(0);
+ if (!N->getValueType(0).isSimple())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
EVT WideVecVT = N->getOperand(0).getValueType();
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41360,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- MVT OpVT = N->getSimpleValueType(0);
+ if (SDValue V = narrowExtractedVectorSelect(N, DAG))
+ return V;
+
SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
- if (OpVT.getScalarType() == MVT::i1)
- return DAG.getConstant(1, SDLoc(N), OpVT);
- return getOnesVector(OpVT, DAG, SDLoc(N));
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getConstant(1, SDLoc(N), VT);
+ return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
- OpVT, SDLoc(N),
- InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+ VT, SDLoc(N),
+ InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
+
+ // Try to move vector bitcast after extract_subv by scaling extraction index:
+ // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+ // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
+ if (InVec.getOpcode() == ISD::BITCAST &&
+ InVec.getOperand(0).getValueType().isVector()) {
+ SDValue SrcOp = InVec.getOperand(0);
+ EVT SrcVT = SrcOp.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+ if ((DestNumElts % SrcNumElts) == 0) {
+ unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+ if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
+ unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getScalarType(), NewExtNumElts);
+ if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ SrcOp, NewIndex);
+ return DAG.getBitcast(VT, NewExtract);
+ }
+ }
+ }
+ }
+
+ // If we're extracting from a broadcast then we're better off just
+ // broadcasting to the smaller type directly, assuming this is the only use.
+ // As it's a broadcast we don't care about the extraction index.
+ if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
+ InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
- if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
- return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ // v2f64 CVTUDQ2PD(v4i32).
+ if (InOpcode == ISD::UINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
- return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
- if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
- OpVT.is128BitVector() &&
- InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- unsigned ExtOp =
- InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
- : ISD::SIGN_EXTEND_VECTOR_INREG;
- return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
- }
- if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ if ((InOpcode == ISD::ANY_EXTEND ||
+ InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::ZERO_EXTEND ||
+ InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
- OpVT.is128BitVector() &&
+ VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
+ unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
+ return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
- if (InOpcode == ISD::BITCAST) {
- // TODO - do this for target shuffles in general.
- SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
- if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
- SDLoc DL(N);
- SDValue SubPSHUFB =
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
- extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
- extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
- return DAG.getBitcast(OpVT, SubPSHUFB);
- }
+ if (InOpcode == ISD::VSELECT &&
+ InVec.getOperand(0).getValueType().is256BitVector() &&
+ InVec.getOperand(1).getValueType().is256BitVector() &&
+ InVec.getOperand(2).getValueType().is256BitVector()) {
+ SDLoc DL(N);
+ SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
+ SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
+ SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
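// [Illustrative sketch, not part of the patch] New extract_subvector folds
// added above, shown on example types:
//   extract_subv (bitcast v8i32 X to v32i8), 16
//     --> bitcast (extract_subv X, 4) to v16i8      (index rescaled)
//   extract_subv (vbroadcast x), n --> vbroadcast x  (narrower result type)
// Low-half extracts of any/zero/sign extends become the matching
// *_EXTEND_VECTOR_INREG of the 128-bit source, and a low 128-bit extract of
// a 256-bit vselect is split into a 128-bit vselect.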
@@ -41428,6 +44058,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
@@ -41436,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
@@ -41445,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
- Src.getOperand(0), Src.getOperand(1));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+ Src.getOperand(1));
+
+ // Reduce v2i64 to v4i32 if we don't need the upper bits.
+ // TODO: Move to DAGCombine?
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
+ Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
+ Src.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
return SDValue();
}
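// [Illustrative example, not part of the patch] The new scalar_to_vector fold
// above drops a pointless 64-bit extension when only the low 32 bits of the
// element can matter:
//   scalar_to_vector v2i64 (anyext i32 x to i64)
//     --> bitcast (scalar_to_vector v4i32 x) to v2i64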
@@ -41483,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Try to merge vector loads and extend_inreg to an extload.
+ if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+ In.hasOneUse()) {
+ auto *Ld = cast<LoadSDNode>(In);
+ if (!Ld->isVolatile()) {
+ MVT SVT = In.getSimpleValueType().getVectorElementType();
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+ ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getVectorNumElements());
+ if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
+ SDValue Load =
+ DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+
+ // Disabling for widening legalization for now. We can enable if we find a
+ // case that needs it. Otherwise it can be deleted when we switch to
+ // widening legalization.
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
+ if (In.getOpcode() == N->getOpcode() &&
+ TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
+
+ // Attempt to combine as a shuffle.
+ // TODO: SSE41 support
+ if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
+ SDValue Op(N, 0);
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
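// [Illustrative sketch, not part of the patch] combineExtInVec folds a
// *_EXTEND_VECTOR_INREG of a one-use, non-volatile vector load into a single
// extending load when that extload is legal, e.g.
//   sext_invec v4i32 (load <16 x i8>, p) --> sextload v4i32, p  (MemVT v4i8)
// and, unless experimental widening legalization is enabled, also tries to
// re-express the extend as a shuffle on AVX targets.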
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -41494,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case ISD::CONCAT_VECTORS:
+ return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
@@ -41506,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADD:
+ case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::SHL: return combineShiftLeft(N, DAG);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
- case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
@@ -41535,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
- case X86ISD::CVTSI2P:
+ case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
+ Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
@@ -41624,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
- // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
- // we have specializations to turn 32-bit multiply into LEA or other ops.
+ // TODO: Almost no 8-bit ops are desirable because they have no actual
+ // size/speed advantages vs. 32-bit ops, but they do have a major
+ // potential disadvantage by causing partial register stalls.
+ //
+ // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+ // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
- if (Opc == ISD::MUL && VT == MVT::i8)
+ if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
@@ -41642,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
@@ -41717,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
@@ -41889,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
+static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
+ X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
+ .Case("{@cca}", X86::COND_A)
+ .Case("{@ccae}", X86::COND_AE)
+ .Case("{@ccb}", X86::COND_B)
+ .Case("{@ccbe}", X86::COND_BE)
+ .Case("{@ccc}", X86::COND_B)
+ .Case("{@cce}", X86::COND_E)
+ .Case("{@ccz}", X86::COND_E)
+ .Case("{@ccg}", X86::COND_G)
+ .Case("{@ccge}", X86::COND_GE)
+ .Case("{@ccl}", X86::COND_L)
+ .Case("{@ccle}", X86::COND_LE)
+ .Case("{@ccna}", X86::COND_BE)
+ .Case("{@ccnae}", X86::COND_B)
+ .Case("{@ccnb}", X86::COND_AE)
+ .Case("{@ccnbe}", X86::COND_A)
+ .Case("{@ccnc}", X86::COND_AE)
+ .Case("{@ccne}", X86::COND_NE)
+ .Case("{@ccnz}", X86::COND_NE)
+ .Case("{@ccng}", X86::COND_LE)
+ .Case("{@ccnge}", X86::COND_L)
+ .Case("{@ccnl}", X86::COND_GE)
+ .Case("{@ccnle}", X86::COND_G)
+ .Case("{@ccno}", X86::COND_NO)
+ .Case("{@ccnp}", X86::COND_P)
+ .Case("{@ccns}", X86::COND_NS)
+ .Case("{@cco}", X86::COND_O)
+ .Case("{@ccp}", X86::COND_P)
+ .Case("{@ccs}", X86::COND_S)
+ .Default(X86::COND_INVALID);
+ return Cond;
+}
+
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
@@ -41949,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
}
}
- }
+ } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
@@ -42120,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const {
return TargetLowering::LowerXConstraint(ConstraintVT);
}
+// Lower @cc targets via setcc.
+SDValue X86TargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
+ SelectionDAG &DAG) const {
+ X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
+ if (Cond == X86::COND_INVALID)
+ return SDValue();
+ // Check that return type is valid.
+ if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+ OpInfo.ConstraintVT.getSizeInBits() < 8)
+ report_fatal_error("Flag output operand is of invalid type");
+
+ // Get EFLAGS register. Only update chain when copyfrom is glued.
+ if (Flag.getNode()) {
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
+ Chain = Flag.getValue(1);
+ } else
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
+ // Extract CC code.
+ SDValue CC = getSETCC(Cond, Flag, DL, DAG);
+ // Extend to 32-bits
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
+
+ return Result;
+}
+
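// [Illustrative usage, not part of the patch] The parseConstraintCode and
// LowerAsmOutputForConstraint additions implement GCC-style inline asm flag
// outputs, so C/C++ code can read EFLAGS directly, roughly:
//   int carry; unsigned long lo, rhs;
//   asm("addq %2, %1" : "=@ccc"(carry), "+r"(lo) : "r"(rhs));
// The chosen condition is materialized with SETcc and zero-extended to the
// output's integer type.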
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
@@ -42229,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
- // Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+ bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
+ BooleanContent BCont = getBooleanContents(MVT::i64);
+ ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
+ : ISD::SIGN_EXTEND;
+ int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
+ : CST->getSExtValue();
+ Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
@@ -42242,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = nullptr;
- int64_t Offset = 0;
-
- // Match either (GA), (GA+C), (GA+C1+C2), etc.
- while (1) {
- if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
- Offset += GA->getOffset();
- break;
- } else if (Op.getOpcode() == ISD::ADD) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- } else if (Op.getOpcode() == ISD::SUB) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += -C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- }
-
- // Otherwise, this isn't something we can handle, reject it.
- return;
- }
-
- const GlobalValue *GV = GA->getGlobal();
- // If we require an extra load to get this address, as in PIC mode, we
- // can't accept it.
- if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
- return;
-
- Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
- GA->getValueType(0), Offset);
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(
+ Subtarget.classifyGlobalReference(GA->getGlobal())))
+ return;
break;
}
}
@@ -42307,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) {
RC.hasSuperClassEq(&X86::VR512RegClass);
}
+/// Check if \p RC is a mask register class.
+/// I.e., VK* or one of their variant.
+static bool isVKClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::VK1RegClass) ||
+ RC.hasSuperClassEq(&X86::VK2RegClass) ||
+ RC.hasSuperClassEq(&X86::VK4RegClass) ||
+ RC.hasSuperClassEq(&X86::VK8RegClass) ||
+ RC.hasSuperClassEq(&X86::VK16RegClass) ||
+ RC.hasSuperClassEq(&X86::VK32RegClass) ||
+ RC.hasSuperClassEq(&X86::VK64RegClass);
+}
+
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -42317,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
+ // 'A' means [ER]AX + [ER]DX.
+ case 'A':
+ if (Subtarget.is64Bit())
+ return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
+ assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
+ "Expecting 64, 32 or 16 bit subtarget");
+ return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
- // Only supported in AVX512 or later.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32RegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16RegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8RegClass);
- case MVT::i1:
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16RegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32RegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
- }
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
@@ -42403,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+ if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
@@ -42431,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
- return std::make_pair(0U, &X86::VR256RegClass);
+ if (Subtarget.hasAVX())
+ return std::make_pair(0U, &X86::VR256RegClass);
+ break;
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
- return std::make_pair(0U, &X86::VR512RegClass);
+ if (!Subtarget.hasAVX512()) break;
+ if (VConstraint)
+ return std::make_pair(0U, &X86::VR512RegClass);
+ return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
@@ -42457,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
- if (Subtarget.hasAVX512()) { // Only supported in AVX512.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32WMRegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16WMRegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8WMRegClass);
- case MVT::i1:
+ if (Subtarget.hasAVX512()) {
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8WMRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16WMRegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32WMRegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
- }
}
break;
}
}
+ if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return std::make_pair(0U, &X86::GR32RegClass);
+
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
@@ -42505,14 +45276,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
- // 'A' means [ER]AX + [ER]DX.
- if (Constraint == "A") {
- if (Subtarget.is64Bit())
- return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
- assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
- "Expecting 64, 32 or 16 bit subtarget");
- return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- }
+ // dirflag -> DF
+ if (StringRef("{dirflag}").equals_lower(Constraint))
+ return std::make_pair(X86::DF, &X86::DFCCRRegClass);
+
+ // fpsr -> FPSW
+ if (StringRef("{fpsr}").equals_lower(Constraint))
+ return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
+
return Res;
}
@@ -42561,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
- switch (Res.first) {
- case X86::EAX:
+ switch (DestReg) {
+ case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- case X86::EDX:
+ case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
- case X86::ECX:
+ case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
- case X86::EBX:
+ case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
- case X86::ESI:
+ case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
- case X86::EDI:
+ case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
- case X86::EBP:
+ case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
@@ -42594,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
- Res.second = &X86::FR32RegClass;
+ Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
- Res.second = &X86::FR64RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
- Res.second = &X86::VR128RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
- Res.second = &X86::VR256RegClass;
+ Res.second = &X86::FR64XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
+ Res.second = &X86::VR128XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
+ Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
@@ -42608,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.first = 0;
Res.second = nullptr;
}
+ } else if (isVKClass(*Class)) {
+ if (VT == MVT::i1)
+ Res.second = &X86::VK1RegClass;
+ else if (VT == MVT::i8)
+ Res.second = &X86::VK8RegClass;
+ else if (VT == MVT::i16)
+ Res.second = &X86::VK16RegClass;
+ else if (VT == MVT::i32)
+ Res.second = &X86::VK32RegClass;
+ else if (VT == MVT::i64)
+ Res.second = &X86::VK64RegClass;
+ else {
+ // Type mismatch and not a clobber: Return an error;
+ Res.first = 0;
+ Res.second = nullptr;
+ }
}
return Res;
@@ -42660,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
- Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+ Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
@@ -42688,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
@@ -42709,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const {
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
-StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+StringRef
+X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();