author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000 |
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000 |
commit | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch) |
tree | 599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/X86/X86ISelLowering.cpp |
parent | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff) |
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 9548 |
1 file changed, 6168 insertions, 3380 deletions
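Many of the hunks below edit X86's SelectionDAG legalization tables — calls such as setOperationAction(ISD::ABS, MVT::i64, Custom) that record how the generic legalizer should handle each (opcode, value type) pair. As a minimal, self-contained sketch of that table-driven pattern (a hypothetical mock with made-up opcode and type IDs, not LLVM's real TargetLoweringBase API):

```cpp
// Stripped-down illustration of an (opcode, value-type) -> action table,
// mirroring the pattern the legalization hunks below keep adjusting.
#include <iostream>
#include <map>
#include <utility>

enum class Action { Legal, Promote, Expand, Custom };

class MockLowering {
  // Keyed by (opcode, simple value type); anything absent defaults to Legal.
  std::map<std::pair<unsigned, unsigned>, Action> Actions;

public:
  void setOperationAction(unsigned Op, unsigned VT, Action A) {
    Actions[{Op, VT}] = A;
  }
  Action getOperationAction(unsigned Op, unsigned VT) const {
    auto It = Actions.find({Op, VT});
    return It == Actions.end() ? Action::Legal : It->second;
  }
};

int main() {
  // Fake IDs purely for the demo; they stand in for ISD::ABS, MVT::i32, MVT::i64.
  constexpr unsigned ISD_ABS = 1, MVT_i32 = 10, MVT_i64 = 11;

  MockLowering TLI;
  TLI.setOperationAction(ISD_ABS, MVT_i32, Action::Custom);
  TLI.setOperationAction(ISD_ABS, MVT_i64, Action::Custom);

  std::cout << "ABS i64 custom? "
            << (TLI.getOperationAction(ISD_ABS, MVT_i64) == Action::Custom)
            << "\n";
}
```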
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..0b4bf687e6cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addBypassSlowDiv(64, 32);
   }
-  if (Subtarget.isTargetKnownWindowsMSVC() ||
+  if (Subtarget.isTargetWindowsMSVC() ||
       Subtarget.isTargetWindowsItanium()) {
     // Setup Windows compiler runtime calls.
     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setUseUnderscoreLongJmp(true);
   }
+  // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to
+  // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
+  // FIXME: Should we be limitting the atomic size on other configs? Default is
+  // 1024.
+  if (!Subtarget.hasCmpxchg8b())
+    setMaxAtomicSizeInBitsSupported(32);
+
   // Set up the register classes.
   addRegisterClass(MVT::i8, &X86::GR8RegClass);
   addRegisterClass(MVT::i16, &X86::GR16RegClass);
@@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Integer absolute.
   if (Subtarget.hasCMov()) {
     setOperationAction(ISD::ABS , MVT::i16 , Custom);
-    setOperationAction(ISD::ABS , MVT::i32 , Custom);
-    if (Subtarget.is64Bit())
-      setOperationAction(ISD::ABS , MVT::i64 , Custom);
+    setOperationAction(ISD::ABS , MVT::i32 , Custom);
   }
+  setOperationAction(ISD::ABS , MVT::i64 , Custom);
   // Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { @@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); @@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + else + setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } + if (!Subtarget.is64Bit()) + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } @@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); + // Disable f32->f64 extload as we can only generate this in one instruction + // under optsize. So its easier to pattern match (fpext (load)) for that + // case instead of needing to emit 2 instructions for extload in the + // non-optsize case. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); @@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); + setOperationAction(ISD::LROUND, MVT::f80, Expand); + setOperationAction(ISD::LLROUND, MVT::f80, Expand); + setOperationAction(ISD::LRINT, MVT::f80, Expand); + setOperationAction(ISD::LLRINT, MVT::f80, Expand); } // Always use a library call for pow. 
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + + setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); + setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); + setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); if (!ExperimentalVectorWideningLegalization) { // Use widening instead of promotion. @@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); - // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. - setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); - setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); @@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); - // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); } - if (ExperimentalVectorWideningLegalization) { - // These types need custom splitting if their input is a 128-bit vector. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - } + // These types need custom splitting if their input is a 128-bit vector. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); @@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); - // TODO - remove this once 256-bit X86ISD::ANDNP correctly split. - setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom); - // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); @@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { - setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } @@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) @@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } @@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); @@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE + // to 512-bit rather than use the AVX2 instructions so that we can use + // k-masks. 
if (!Subtarget.hasVLX()) { - // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE - // to 512-bit rather than use the AVX2 instructions so that we can use - // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); @@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Custom); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. @@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v8f64, Custom); - setOperationAction(ISD::SELECT, MVT::v8i64, Custom); - setOperationAction(ISD::SELECT, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v32i16, Custom); - setOperationAction(ISD::SELECT, MVT::v64i8, Custom); - setOperationAction(ISD::SELECT, MVT::v16f32, Custom); - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium())) + if (Subtarget.is32Bit() && + (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) @@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); @@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. -EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { - if (Size >= 16 && - (!Subtarget.isUnalignedMem16Slow() || - ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16)))) { +/// For vector ops we check that the overall size isn't larger than our +/// preferred vector width. +EVT X86TargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. - if (Size >= 32 && Subtarget.hasAVX()) { + if (Size >= 32 && Subtarget.hasAVX() && + (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, @@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // multiply) before we splat as a vector. return MVT::v32i8; } - if (Subtarget.hasSSE2()) + if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. 
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87())) + if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && + (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { @@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { return true; } -bool -X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, - bool *Fast) const { +bool X86TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: @@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // TODO: What about AVX-512 (512-bit) accesses? } } + // NonTemporal vector memory ops must be aligned. + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + // NT loads can only be vector aligned, so if its less aligned than the + // minimum vector size (which we can split the vector down to), we might as + // well use a regular unaligned vector load. + // We don't have any NT loads pre-SSE41. + if (!!(Flags & MachineMemOperand::MOLoad)) + return (Align < 16 || !Subtarget.hasSSE41()); + return false; + } // Misaligned accesses of any size are always allowed. return true; } @@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. - auto *SecurityCheckCookie = cast<Function>( - M.getOrInsertFunction("__security_check_cookie", - Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()))); - SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); - SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); + FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { + F->setCallingConv(CallingConv::X86_FastCall); + F->addAttribute(1, Attribute::AttrKind::InReg); + } return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. @@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { return TargetLowering::getSDagStackGuard(M); } -Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { +Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. 
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { @@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "X86GenCallingConv.inc" - bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { @@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, "The values should reside in two registers"); SDValue Lo, Hi; - unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); @@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. - Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); @@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: + // Swift: + case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); @@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, else ValVT = VA.getValVT(); - // Calculate SP offset of interrupt parameter, re-arrange the slot normally - // taken by a return address. - int Offset = 0; - if (CallConv == CallingConv::X86_INTR) { - // X86 interrupts may take one or two arguments. - // On the stack there will be no return address as in regular call. - // Offset of last argument need to be set to -4/-8 bytes. - // Where offset of the first argument out of two, should be set to 0 bytes. - Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); - if (Subtarget.is64Bit() && Ins.size() == 2) { - // The stack pointer needs to be realigned for 64 bit handlers with error - // code, so the argument offset changes by 8 bytes. - Offset += 8; - } - } - // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. - if (Flags.isCopyElisionCandidate()) { + // If the argument is passed directly in memory without any extension, then we + // can perform copy elision. Large vector types, for example, may be passed + // indirectly by pointer. + if (Flags.isCopyElisionCandidate() && + VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { @@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // load from our portion of it. 
This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), - /*Immutable=*/false); + /*IsImmutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, @@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, MFI.setObjectSExt(FI, true); } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, @@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); - if (CallConv == CallingConv::X86_INTR) { - bool isLegal = Ins.size() == 1 || - (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || - (!Is64Bit && Ins[1].VT == MVT::i32))); - if (!isLegal) - report_fatal_error("X86 interrupts may take one or two arguments"); - } - // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments( } // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &F : Forwards) { + for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); - Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); } } @@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); + MachineFunction::CallSiteInfo CSInfo; + if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. @@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (Callee->getOpcode() == ISD::GlobalAddress) { - // If the callee is a GlobalAddress node (quite common, every direct call - // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack - // it. - GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); - - // We should use extra load for direct calls to dllimported functions in - // non-JIT mode. 
- const GlobalValue *GV = G->getGlobal(); - if (!GV->hasDLLImportStorageClass()) { - unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); - - Callee = DAG.getTargetGlobalAddress( - GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - // Add a wrapper. - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - // Add extra indirection - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } - } - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlags = - Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); - - Callee = DAG.getTargetExternalSymbol( - S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } + } else if (Callee->getOpcode() == ISD::GlobalAddress || + Callee->getOpcode() == ISD::ExternalSymbol) { + // Lower direct calls to global addresses and external symbols. Setting + // ForCall to true here has the effect of removing WrapperRIP when possible + // to allow direct calls to be selected without first materializing the + // address into a register. + Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI @@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } if (HasNoCfCheck && IsCFProtectionSupported) { @@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; @@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!IntrData) return false; - Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags = MachineMemOperand::MONone; Info.offset = 0; @@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { + Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; @@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; break; } + case GATHER: + case GATHER_AVX2: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOLoad; + break; + } + case SCATTER: { + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOStore; + break; + } default: return false; } @@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; @@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + + // If this is an (1) AVX vector load with (2) multiple uses and (3) all of + // those uses are extracted directly into a store, then the extract + store + // can be store-folded. Therefore, it's probably not worth splitting the load. + EVT VT = Load->getValueType(0); + if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { + for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { + // Skip uses of the chain value. Result 0 of the node is the load value. + if (UI.getUse().getResNo() != 0) + continue; + + // If this use is not an extract + store, it's probably worth splitting. + if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || + UI->use_begin()->getOpcode() != ISD::STORE) + return true; + } + // All non-chain uses are extract + store. + return false; + } + return true; } @@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { + unsigned Opc = VecOp.getOpcode(); + + // Assume target opcodes can't be scalarized. 
+ // TODO - do we have any exceptions? + if (Opc >= ISD::BUILTIN_OP_END) + return false; + // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); - if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT)) + if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); - return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT); + return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); +} + +bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { + // TODO: Allow vectors? + if (VT.isVector()) + return false; + return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz() const { @@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } -bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, - EVT BitcastVT) const { +bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; @@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; - return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT); + // If both types are legal vectors, it's always ok to convert them. + if (LoadVT.isVector() && BitcastVT.isVector() && + isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) + return true; + + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, @@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } + // Make sure we don't merge greater than our preferred vector + // width. + if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) + return false; return true; } @@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } -bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const { +bool X86TargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + EVT VT = N->getValueType(0); + if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || + (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { + // Only fold if the shift values are equal - so it folds to AND. + // TODO - we should fold if either is a non-uniform vector but we don't do + // the fold for non-splats yet. + return N->getOperand(1) == N->getOperand(0).getOperand(1); + } + return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); +} + +bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. 
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } -/// Return true if every element in Mask, beginning -/// from position Pos and ending in Pos+Size is the undef sentinel value. +/// Return true if every element in Mask, beginning from position Pos and ending +/// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (Mask[i] != SM_SentinelUndef) @@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } +/// Return true if the mask creates a vector whose lower half is undefined. +static bool isUndefLowerHalf(ArrayRef<int> Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, 0, NumElts / 2); +} + +/// Return true if the mask creates a vector whose upper half is undefined. +static bool isUndefUpperHalf(ArrayRef<int> Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, NumElts / 2, NumElts / 2); +} + /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); @@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, DAG.getIntPtrConstant(0, dl)); } +/// Widen a vector to a larger size with the same scalar type, with the new +/// elements either zero or undef. +static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, unsigned WideSizeInBits) { + assert(Vec.getValueSizeInBits() < WideSizeInBits && + (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && + "Unsupported vector widening type"); + unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); + MVT SVT = Vec.getSimpleValueType().getScalarType(); + MVT VT = MVT::getVectorVT(SVT, WideNumElts); + return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); +} + +// Helper function to collect subvector ops that are concated together, +// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. +// The subvectors in Ops are guaranteed to be the same type. +static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { + assert(Ops.empty() && "Expected an empty ops vector"); + + if (N->getOpcode() == ISD::CONCAT_VECTORS) { + Ops.append(N->op_begin(), N->op_end()); + return true; + } + + if (N->getOpcode() == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(2))) { + SDValue Src = N->getOperand(0); + SDValue Sub = N->getOperand(1); + const APInt &Idx = N->getConstantOperandAPInt(2); + EVT VT = Src.getValueType(); + EVT SubVT = Sub.getValueType(); + + // TODO - Handle more general insert_subvector chains. + if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && + Idx == (VT.getVectorNumElements() / 2) && + Src.getOpcode() == ISD::INSERT_SUBVECTOR && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + } + + return false; +} + // Helper for splitting operands of an operation to legal target size and // apply a function on each part. 
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in @@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } -// Return true if the instruction zeroes the unused upper part of the -// destination and accepts mask. -static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { - switch (Opcode) { - default: - return false; - case X86ISD::CMPM: - case X86ISD::CMPM_RND: - case ISD::SETCC: - return true; - } -} - /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { return DAG.getBitcast(VT, Vec); } -static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, - SelectionDAG &DAG) { +// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. +static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + return ISD::ANY_EXTEND_VECTOR_INREG; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + return ISD::ZERO_EXTEND_VECTOR_INREG; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + return ISD::SIGN_EXTEND_VECTOR_INREG; + } + llvm_unreachable("Unknown opcode"); +} + +static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, + SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); + assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || + ISD::ZERO_EXTEND == Opcode) && + "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. @@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, InVT = In.getValueType(); } - if (VT.getVectorNumElements() == InVT.getVectorNumElements()) - return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, VT, In); + if (VT.getVectorNumElements() != InVT.getVectorNumElements()) + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG, - DL, VT, In); + return DAG.getNode(Opcode, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. @@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } -// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops. 
-static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) { - while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) - V = V.getOperand(0); - return V; -} - -static const Constant *getTargetConstantFromNode(SDValue Op) { - Op = peekThroughBitcasts(Op); - - auto *Load = dyn_cast<LoadSDNode>(Op); - if (!Load) +static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { + if (!Load || !ISD::isNormalLoad(Load)) return nullptr; SDValue Ptr = Load->getBasePtr(); @@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { return CNode->getConstVal(); } +static const Constant *getTargetConstantFromNode(SDValue Op) { + Op = peekThroughBitcasts(Op); + return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); +} + +const Constant * +X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { + assert(LD && "Unexpected null LoadSDNode"); + return getTargetConstantFromNode(LD); +} + // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, @@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; - APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); - EltBits[i] = Bits.getZExtValue(); + EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; @@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + // Extract constant bits from a subvector broadcast. + if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { + SmallVector<APInt, 16> SubEltBits; + if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, SubEltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + UndefElts = APInt::getSplat(NumElts, UndefElts); + while (EltBits.size() < NumElts) + EltBits.append(SubEltBits.begin(), SubEltBits.end()); + return true; + } + } + // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && @@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return CastBitData(UndefSrcElts, SrcEltBits); } + // Insert constant bits from a base and sub vector sources. + if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(Op.getOperand(2))) { + // TODO - support insert_subvector through bitcasts. + if (EltSizeInBits != VT.getScalarSizeInBits()) + return false; + + APInt UndefSubElts; + SmallVector<APInt, 32> EltSubBits; + if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, + UndefSubElts, EltSubBits, + AllowWholeUndefs, AllowPartialUndefs) && + getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, EltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + unsigned BaseIdx = Op.getConstantOperandVal(2); + UndefElts.insertBits(UndefSubElts, BaseIdx); + for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) + EltBits[BaseIdx + i] = EltSubBits[i]; + return true; + } + } + // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa<ConstantSDNode>(Op.getOperand(1))) { @@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, } } +// Split the demanded elts of a HADD/HSUB node between its operands. 
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, + APInt &DemandedLHS, APInt &DemandedRHS) { + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumEltsPerLane = NumElts / NumLanes; + int HalfEltsPerLane = NumEltsPerLane / 2; + + DemandedLHS = APInt::getNullValue(NumElts); + DemandedRHS = APInt::getNullValue(NumElts); + + // Map DemandedElts to the horizontal operands. + for (int Idx = 0; Idx != NumElts; ++Idx) { + if (!DemandedElts[Idx]) + continue; + int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; + int LocalIdx = Idx % NumEltsPerLane; + if (LocalIdx < HalfEltsPerLane) { + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } else { + LocalIdx -= HalfEltsPerLane; + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. @@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - const SelectionDAG &DAG); + SelectionDAG &DAG); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. -static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, +static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, + SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { Mask.clear(); Ops.clear(); @@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && - "Expected byte aligned value types"); + if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) + return false; + assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned Opcode = N.getOpcode(); switch (Opcode) { @@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::OR: { + // Inspect each operand at the byte level. We can merge these into a + // blend shuffle mask if for each byte at least one is masked out (zero). 
+ KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); + KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + if (Known0.One.isNullValue() && Known1.One.isNullValue()) { + bool IsByteMask = true; + unsigned NumSizeInBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; + APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); + APInt SelectMask = APInt::getNullValue(NumBytesPerElt); + for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { + unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue(); + unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue(); + if (LHS == 255 && RHS == 0) + SelectMask.setBit(i); + else if (LHS == 255 && RHS == 255) + ZeroMask.setBit(i); + else if (!(LHS == 0 && RHS == 255)) + IsByteMask = false; + } + if (IsByteMask) { + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) { + for (unsigned j = 0; j != NumBytesPerElt; ++j) { + unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0); + int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs)); + Mask.push_back(Idx); + } + } + Ops.push_back(N.getOperand(0)); + Ops.push_back(N.getOperand(1)); + return true; + } + } + // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); @@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::INSERT_SUBVECTOR: { - // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where - // SRC0/SRC1 are both of the same valuetype VT. - // TODO - add peekThroughOneUseBitcasts support. SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); @@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, if (!isa<ConstantSDNode>(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; + uint64_t InsertIdx = N.getConstantOperandVal(2); + // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0).getValueType() == VT && + isa<ConstantSDNode>(Sub.getOperand(1))) { + uint64_t ExtractIdx = Sub.getConstantOperandVal(1); + for (int i = 0; i != (int)NumElts; ++i) + Mask.push_back(i); + for (int i = 0; i != (int)NumSubElts; ++i) + Mask[InsertIdx + i] = NumElts + ExtractIdx + i; + Ops.push_back(Src); + Ops.push_back(Sub.getOperand(0)); + return true; + } + // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). 
SmallVector<int, 64> SubMask; SmallVector<SDValue, 2> SubInputs; - if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) || - SubMask.size() != NumSubElts) + if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG)) return false; + if (SubMask.size() != NumSubElts) { + assert(((SubMask.size() % NumSubElts) == 0 || + (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); + if ((NumSubElts % SubMask.size()) == 0) { + int Scale = NumSubElts / SubMask.size(); + SmallVector<int,64> ScaledSubMask; + scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask); + SubMask = ScaledSubMask; + } else { + int Scale = SubMask.size() / NumSubElts; + NumSubElts = SubMask.size(); + NumElts *= Scale; + InsertIdx *= Scale; + } + } Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { - if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR || - SubInput.getOperand(0).getValueType() != VT || - !isa<ConstantSDNode>(SubInput.getOperand(1))) - return false; - Ops.push_back(SubInput.getOperand(0)); + EVT SubSVT = SubInput.getValueType().getScalarType(); + EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, + NumSizeInBits / SubSVT.getSizeInBits()); + Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, + DAG.getUNDEF(AltVT), SubInput, + DAG.getIntPtrConstant(0, SDLoc(N)))); } - int InsertIdx = N.getConstantOperandVal(2); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; - int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1); - M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts); + M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } @@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); + APInt EltsLHS, EltsRHS; + getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); + // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { - if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || - (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) + if ((!N0.isUndef() && + DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + (!N1.isUndef() && + DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) + if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || + (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) return false; } @@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, } return true; } - case ISD::ZERO_EXTEND_VECTOR_INREG: - case ISD::ZERO_EXTEND: { - // TODO - add support for VPMOVZX with smaller input vector types. 
+ case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); - if (NumSizeInBits != SrcVT.getSizeInBits()) - break; - DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, + if (!SrcVT.isVector()) + return false; + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal broadcast type"); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumSizeInBits / SrcVT.getScalarSizeInBits()); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + + Ops.push_back(Src); + Mask.append(NumElts, 0); + return true; + } + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + case ISD::ANY_EXTEND_VECTOR_INREG: { + SDValue Src = N.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // Extended source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; + + unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); + bool IsAnyExtend = + (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); + DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, Mask); + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal zero-extension type"); + SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), + NumSizeInBits / NumSrcBitsPerElt); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + Ops.push_back(Src); return true; } @@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return false; } -/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. +/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask) { int MaskWidth = Mask.size(); @@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, M = SM_SentinelUndef; // Check for unused inputs. - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedInputs.push_back(Inputs[i]); + if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; continue; } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; + + // Check for repeated inputs. + bool IsRepeat = false; + for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { + if (UsedInputs[j] != Inputs[i]) + continue; + for (int &M : Mask) + if (lo <= M) + M = (M < hi) ? 
((M - lo) + (j * MaskWidth)) : (M - MaskWidth); + IsRepeat = true; + break; + } + if (IsRepeat) + continue; + + UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } @@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { + unsigned NumElts = Op.getValueType().getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) + if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); @@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, Depth+1); } + // Recurse into insert_subvector base/sub vector to find scalars. + if (Opcode == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(2))) { + SDValue Vec = N->getOperand(0); + SDValue Sub = N->getOperand(1); + EVT SubVT = Sub.getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + uint64_t SubIdx = N->getConstantOperandVal(2); + + if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) + return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); + return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); + } + + // Recurse into extract_subvector src vector to find scalars. + if (Opcode == ISD::EXTRACT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(1))) { + SDValue Src = N->getOperand(0); + uint64_t SrcIdx = N->getConstantOperandVal(1); + return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); + } + // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); @@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register - // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. + // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) @@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } @@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDLoc dl(Op); SDValue V; - bool First = true; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. - for (unsigned i = 0; i < 16; ++i) { + for (unsigned i = 0; i < 16; i += 2) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; - if (ThisIsNonZero && First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0; + if (!ThisIsNonZero && !NextIsNonZero) + continue; + + // FIXME: Investigate combining the first 4 bytes as a i32 instead. 
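The v16i8 build-vector path that continues below packs each adjacent pair of byte elements into one 16-bit lane so a single word insert can place both. A tiny standalone model of that packing, assuming little-endian lane layout (the helper name is made up for the sketch):

// Illustrative sketch only, not part of the patch: combine two adjacent
// byte elements into one 16-bit lane value, low byte first.
#include <cassert>
#include <cstdint>

uint16_t packBytePair(uint8_t Lo, uint8_t Hi) {
  return static_cast<uint16_t>(Lo) | (static_cast<uint16_t>(Hi) << 8);
}

int main() {
  assert(packBytePair(0x34, 0x12) == 0x1234);
  return 0;
}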
+ SDValue Elt; + if (ThisIsNonZero) { + if (NumZero || NextIsNonZero) + Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); else - V = DAG.getUNDEF(MVT::v8i16); - First = false; + Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); } - if ((i & 1) != 0) { - // FIXME: Investigate extending to i32 instead of just i16. - // FIXME: Investigate combining the first 4 bytes as a i32 instead. - SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; - if (LastIsNonZero) { - LastElt = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); - } - if (ThisIsNonZero) { - ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, - DAG.getConstant(8, dl, MVT::i8)); - if (LastIsNonZero) - ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); - } else - ThisElt = LastElt; - - if (ThisElt) { - if (1 == i) { - V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) - : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); - V = DAG.getBitcast(MVT::v8i16, V); - } else { - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i / 2, dl)); - } + if (NextIsNonZero) { + SDValue NextElt = Op.getOperand(i + 1); + if (i == 0 && NumZero) + NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); + else + NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); + NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, + DAG.getConstant(8, dl, MVT::i8)); + if (ThisIsNonZero) + Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); + else + Elt = NextElt; + } + + // If our first insertion is not the first index then insert into zero + // vector to break any register dependency else use SCALAR_TO_VECTOR. + if (!V) { + if (i != 0) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); + V = DAG.getBitcast(MVT::v8i16, V); + continue; } } + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, + DAG.getIntPtrConstant(i / 2, dl)); } return DAG.getBitcast(MVT::v16i8, V); @@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, } // Find all zeroable elements. - std::bitset<4> Zeroable; - for (int i=0; i < 4; ++i) { - SDValue Elt = Op->getOperand(i); + std::bitset<4> Zeroable, Undefs; + for (int i = 0; i < 4; ++i) { + SDValue Elt = Op.getOperand(i); + Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && @@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; - for (unsigned i=0; i < 4; ++i) { + for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; - SDValue Elt = Op->getOperand(i); + SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(Elt.getOperand(1))) return SDValue(); @@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. - SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + SDValue VZeroOrUndef = (Zeroable == Undefs) + ? 
DAG.getUNDEF(VT) + : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. @@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; - CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); + CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) @@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, unsigned NumElems = Elts.size(); int LastLoadedElt = -1; - SmallBitVector LoadMask(NumElems, false); - SmallBitVector ZeroMask(NumElems, false); - SmallBitVector UndefMask(NumElems, false); + APInt LoadMask = APInt::getNullValue(NumElems); + APInt ZeroMask = APInt::getNullValue(NumElems); + APInt UndefMask = APInt::getNullValue(NumElems); + + SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); + if (Elt.isUndef()) { + UndefMask.setBit(i); + continue; + } + if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { + ZeroMask.setBit(i); + continue; + } + + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + return SDValue(); - if (Elt.isUndef()) - UndefMask[i] = true; - else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) - ZeroMask[i] = true; - else if (ISD::isNON_EXTLoad(Elt.getNode())) { - LoadMask[i] = true; - LastLoadedElt = i; - // Each loaded element must be the correct fractional portion of the - // requested vector load. - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) - return SDValue(); - } else + if (!ISD::isNON_EXTLoad(Elt.getNode())) return SDValue(); + + Loads[i] = cast<LoadSDNode>(Elt); + LoadMask.setBit(i); + LastLoadedElt = i; } - assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && + assert((ZeroMask.countPopulation() + UndefMask.countPopulation() + + LoadMask.countPopulation()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. - if (UndefMask.count() == NumElems) + if (UndefMask.countPopulation() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? - if ((ZeroMask | UndefMask).count() == NumElems) + if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems) return VT.isInteger() ? 
DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - int FirstLoadedElt = LoadMask.find_first(); + int FirstLoadedElt = LoadMask.countTrailingZeros(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); - LoadSDNode *LDBase = cast<LoadSDNode>(EltBase); - EVT LDBaseVT = EltBase.getValueType(); + EVT EltBaseVT = EltBase.getValueType(); + assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && + "Register/Memory size mismatch"); + LoadSDNode *LDBase = Loads[FirstLoadedElt]; + assert(LDBase && "Did not find base load for merging consecutive loads"); + unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); + unsigned BaseSizeInBytes = BaseSizeInBits / 8; + int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; + assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a @@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - SDValue Elt = peekThroughBitcasts(Elts[i]); - LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.areNonVolatileConsecutiveLoads( - LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, - i - FirstLoadedElt)) { + if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, + i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - SmallVector<LoadSDNode *, 8> Loads; - for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) - if (LoadMask[i]) - Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i]))); - auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && @@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, NewLd); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; - // LOAD - all consecutive load/undefs (must start/end with a load). - // If we have found an entire vector of loads and undefs, then return a large - // load of the entire vector width starting at the base pointer. - // If the vector contains zeros, then attempt to shuffle those elements. - if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && + // Check if the base load is entirely dereferenceable. + bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( + VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); + + // LOAD - all consecutive load/undefs (must start/end with a load or be + // entirely dereferenceable). If we have found an entire vector of loads and + // undefs, then return a large load of the entire vector width starting at the + // base pointer. If the vector contains zeros, then attempt to shuffle those + // elements. 
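The consecutive-load test above amounts to checking that every recorded element load sits at the base address plus its element distance times the element size, while undef/zero elements are skipped. A self-contained sketch of that check on plain byte offsets (names and the offset encoding are invented for illustration):

// Illustrative sketch only, not part of the patch: element i must be
// loaded from Base + (i - FirstLoaded) * EltSizeBytes. A negative offset
// stands for an element with no recorded load (undef/zero) and is skipped.
#include <cassert>
#include <vector>

bool loadsAreConsecutive(const std::vector<long> &ByteOffsets,
                         int FirstLoaded, long EltSizeBytes) {
  long Base = ByteOffsets[FirstLoaded];
  for (size_t i = FirstLoaded; i != ByteOffsets.size(); ++i) {
    if (ByteOffsets[i] < 0)
      continue; // no load recorded for this element
    if (ByteOffsets[i] != Base + long(i - FirstLoaded) * EltSizeBytes)
      return false;
  }
  return true;
}

int main() {
  // 4 x i32 loaded from offsets 16, 20, -, 28 (element 2 is undef/zero).
  assert(loadsAreConsecutive({16, 20, -1, 28}, 0, 4));
  assert(!loadsAreConsecutive({16, 24, 20, 28}, 0, 4));
  return 0;
}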
+ if (FirstLoadedElt == 0 && + (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { - assert(LDBase && "Did not find base load for merging consecutive loads"); - EVT EltVT = LDBase->getValueType(0); - // Ensure that the input vector size for the merged loads matches the - // cumulative size of the input elements. - if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) - return SDValue(); - if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); @@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); - if (IsConsecutiveLoad) + if (NumElems == 1) + return DAG.getBitcast(VT, Elts[FirstLoadedElt]); + + if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. - if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { + if (!isAfterLegalize && VT.isVector()) { SmallVector<int, 4> ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) @@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - int LoadSize = - (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); + // If the upper half of a ymm/zmm load is undef then just load the lower half. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned HalfNumElems = NumElems / 2; + if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) { + EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); + SDValue HalfLD = + EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, + DAG, Subtarget, isAfterLegalize); + if (HalfLD) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + HalfLD, DAG.getIntPtrConstant(0, DL)); + } + } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && - (LoadSize == 32 || LoadSize == 64) && + (LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { - MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize) - : MVT::getIntegerVT(LoadSize); - MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); + MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) + : MVT::getIntegerVT(LoadSizeInBits); + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, ResNode); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } + // BROADCAST - match the smallest possible repetition pattern, load that + // scalar/subvector element and then broadcast to the entire vector. 
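The broadcast matching in the hunk that follows searches for the smallest power-of-two period at which the loaded elements repeat, treating undef slots as wildcards. A standalone sketch of just that repetition test (plain C++, simplified: it ignores the extra requirement that loads exist at both ends of the repeated block):

// Illustrative sketch only, not part of the patch: do the elements repeat
// with period P? -1 stands for undef and matches anything.
#include <cassert>
#include <vector>

bool repeatsWithPeriod(const std::vector<int> &Elts, unsigned P) {
  std::vector<int> Rep(P, -1); // representative element per slot
  for (unsigned i = 0; i != Elts.size(); ++i) {
    if (Elts[i] == -1)
      continue;
    if (Rep[i % P] == -1)
      Rep[i % P] = Elts[i];
    else if (Rep[i % P] != Elts[i])
      return false;
  }
  return true;
}

int main() {
  assert(repeatsWithPeriod({7, 9, 7, 9, 7, -1, 7, 9}, 2));
  assert(!repeatsWithPeriod({7, 9, 8, 9}, 2));
  return 0;
}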
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && + (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { + for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { + unsigned RepeatSize = SubElems * BaseSizeInBits; + unsigned ScalarSize = std::min(RepeatSize, 64u); + if (!Subtarget.hasAVX2() && ScalarSize < 32) + continue; + + bool Match = true; + SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); + for (unsigned i = 0; i != NumElems && Match; ++i) { + if (!LoadMask[i]) + continue; + SDValue Elt = peekThroughBitcasts(Elts[i]); + if (RepeatedLoads[i % SubElems].isUndef()) + RepeatedLoads[i % SubElems] = Elt; + else + Match &= (RepeatedLoads[i % SubElems] == Elt); + } + + // We must have loads at both ends of the repetition. + Match &= !RepeatedLoads.front().isUndef(); + Match &= !RepeatedLoads.back().isUndef(); + if (!Match) + continue; + + EVT RepeatVT = + VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) + ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) + : EVT::getFloatingPointVT(ScalarSize); + if (RepeatSize > ScalarSize) + RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, + RepeatSize / ScalarSize); + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), + VT.getSizeInBits() / ScalarSize); + if (TLI.isTypeLegal(BroadcastVT)) { + if (SDValue RepeatLoad = EltsFromConsecutiveLoads( + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { + unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST + : X86ISD::VBROADCAST; + SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); + return DAG.getBitcast(VT, Broadcast); + } + } + } + } + return SDValue(); } +// Combine a vector ops (shuffles etc.) that is equal to build_vector load1, +// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses +// are consecutive, non-overlapping, and in the right order. +static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool isAfterLegalize) { + SmallVector<SDValue, 64> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + return SDValue(); + } + assert(Elts.size() == VT.getVectorNumElements()); + return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, + isAfterLegalize); +} + static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); @@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, return ConstantVector::get(ArrayRef<Constant *>(ConstantVec)); } -static bool isUseOfShuffle(SDNode *N) { +static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { - if (isTargetShuffle(U->getOpcode())) + unsigned Opc = U->getOpcode(); + // VPERMV/VPERMV3 shuffles can never fold their index operands. 
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) + return false; + if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) + return false; + if (isTargetShuffle(Opc)) + return true; + if (Opc == ISD::BITCAST) // Ignore bitcasts + return isFoldableUseOfShuffle(U); + if (N->hasOneUse()) return true; - if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts - return isUseOfShuffle(U); } return false; } @@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. - if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) + if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); + unsigned NumElts = VT.getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + if (BV->getOperand(i).isUndef()) + DemandedElts.clearBit(i); + + // If we don't need the upper xmm, then perform as a xmm hop. + unsigned HalfNumElts = NumElts / 2; + if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); + V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); + SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); + return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); + } + return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } @@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. - unsigned NumNonUndefs = 0; - for (const SDValue &V : BV->op_values()) - if (!V.isUndef()) - ++NumNonUndefs; - + unsigned NumNonUndefs = + count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); @@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. 
MVT VT = BV->getSimpleValueType(0); - unsigned HOpcode; - SDValue V0, V1; - if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2()) + if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || + ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || + ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { + unsigned HOpcode; + SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); + } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) @@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). + bool IsShift = false; switch (Opcode) { default: return SDValue(); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + IsShift = true; + break; case ISD::AND: case ISD::XOR: case ISD::OR: @@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, // We expect the canonicalized RHS operand to be the constant. if (!isa<ConstantSDNode>(RHS)) return SDValue(); + + // Extend shift amounts. + if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { + if (!IsShift) + return SDValue(); + RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); + } + LHSElts.push_back(LHS); RHSElts.push_back(RHS); } + // Limit to shifts by uniform immediates. + // TODO: Only accept vXi8/vXi64 special cases? + // TODO: Permit non-uniform XOP/AVX2/MULLO cases? + if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) + return SDValue(); + SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); @@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, return Vec; } -// Return true if all the operands of the given CONCAT_VECTORS node are zeros -// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) -static bool isExpandWithZeros(const SDValue &Op) { - assert(Op.getOpcode() == ISD::CONCAT_VECTORS && - "Expand with zeros only possible in CONCAT_VECTORS nodes!"); - - for (unsigned i = 1; i < Op.getNumOperands(); i++) - if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) - return false; - - return true; -} - // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. 
-static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { - unsigned Opc = Op.getOpcode(); - - assert(Opc == ISD::CONCAT_VECTORS && - Op.getSimpleValueType().getVectorElementType() == MVT::i1 && - "Unexpected node to check for type promotion!"); - - // As long as we are concatenating zeros to the upper part of a previous node - // result, climb up the tree until a node with different opcode is - // encountered - while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { - if (Opc == ISD::INSERT_SUBVECTOR) { - if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && - Op.getConstantOperandVal(2) == 0) - Op = Op.getOperand(1); - else - return SDValue(); - } else { // Opc == ISD::CONCAT_VECTORS - if (isExpandWithZeros(Op)) - Op = Op.getOperand(0); - else - return SDValue(); - } - Opc = Op.getOpcode(); - } - - // Check if the first inserted node zeroes the upper bits, or an 'and' result - // of a node that zeros the upper bits (its masked version). - if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || - (Op.getOpcode() == ISD::AND && - (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || - isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { - return Op; - } - - return SDValue(); -} - // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, @@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - // If this node promotes - by concatenating zeroes - the type of the result - // of a node with instruction that zeroes all upper (irrelevant) bits of the - // output register, mark it as legal and catch the pattern in instruction - // selection to avoid emitting extra instructions (for zeroing upper bits). - if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) - return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl); - unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; @@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; + assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && + "Illegal target shuffle mask"); for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) @@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { return IsUnpackwdMask; } +static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) { + // Create 128-bit vector type based on mask size. + MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); + MVT VT = MVT::getVectorVT(EltVT, Mask.size()); + + // We can't assume a canonical shuffle mask, so try the commuted version too. + SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end()); + ShuffleVectorSDNode::commuteMask(CommutedMask); + + // Match any of unary/binary or low/high. + for (unsigned i = 0; i != 4; ++i) { + SmallVector<int, 16> UnpackMask; + createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); + if (isTargetShuffleEquivalent(Mask, UnpackMask) || + isTargetShuffleEquivalent(CommutedMask, UnpackMask)) + return true; + } + return false; +} + +/// Return true if a shuffle mask chooses elements identically in its top and +/// bottom halves. For example, any splat mask has the same top and bottom +/// halves. If an element is undefined in only one half of the mask, the halves +/// are not considered identical. 
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) { + assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); + unsigned HalfSize = Mask.size() / 2; + for (unsigned i = 0; i != HalfSize; ++i) { + if (Mask[i] != Mask[i + HalfSize]) + return false; + } + return true; +} + /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable, } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. -static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; @@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND -static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const APInt &Zeroable, - ArrayRef<int> Mask, SDValue &V1, - SDValue &V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, + const APInt &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) @@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getSelect(DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. -static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, SDValue V2, + SelectionDAG &DAG) { SmallVector<int, 8> Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) @@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, // // But when avx512vl is available, one can just use a single vpmovdw // instruction. 
-static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); @@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, return false; } -static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, @@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { - assert(!VT.isFloatingPoint() && "Floating point types are not supported"); +static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); - SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); + SDValue Zero, AllOnes; + // Use f64 if i64 isn't legal. + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + MaskVT = MVT::getVectorVT(EltVT, Mask.size()); + } + + MVT LogicVT = VT; + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + Zero = DAG.getConstantFP(0.0, DL, EltVT); + AllOnes = DAG.getConstantFP( + APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); + LogicVT = + MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); + } else { + Zero = DAG.getConstant(0, DL, EltVT); + AllOnes = DAG.getAllOnesConstant(DL, EltVT); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, if (!V) return SDValue(); // No non-zeroable elements! - SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); - return DAG.getNode(ISD::AND, DL, VT, V, VMask); + SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); + VMask = DAG.getBitcast(LogicVT, VMask); + V = DAG.getBitcast(LogicVT, V); + SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); + return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. @@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. 
-static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); @@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. -static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Original, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; @@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { - case MVT::v2f64: - case MVT::v4f32: - case MVT::v4f64: - case MVT::v8f32: - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; + case MVT::v4f64: + case MVT::v8f32: + assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); + LLVM_FALLTHROUGH; + case MVT::v2f64: case MVT::v2i64: + case MVT::v4f32: case MVT::v4i32: - // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into - // that instruction. - if (Subtarget.hasAVX2()) { - // Scale the blend by the number of 32-bit dwords per element. - int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; - V1 = DAG.getBitcast(BlendVT, V1); - V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast( - VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } - LLVM_FALLTHROUGH; - case MVT::v8i16: { - // For integer shuffles we need to expand the mask and cast the inputs to - // v8i16s prior to blending. - int Scale = 8 / VT.getVectorNumElements(); - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = DAG.getBitcast(MVT::v8i16, V2); - return DAG.getBitcast(VT, - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } + case MVT::v8i16: + assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { - assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); + assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 
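As context for the immediate-blend path above: a blend immediate simply records, per lane, whether the element is taken from the second source (mask index >= vector width). A minimal sketch of that correspondence, ignoring zeroable lanes and the per-type scaling the real lowering performs:

// Illustrative sketch only, not part of the patch: derive a blend
// immediate from a shuffle mask; bit i selects the second input.
#include <cassert>
#include <cstdint>
#include <vector>

uint64_t blendImmFromMask(const std::vector<int> &Mask) {
  const int Size = static_cast<int>(Mask.size());
  uint64_t Imm = 0;
  for (int i = 0; i != Size; ++i)
    if (Mask[i] >= Size) // element taken from the second input
      Imm |= uint64_t(1) << i;
  return Imm;
}

int main() {
  // v4i32 mask <0, 5, 2, 7>: lanes 1 and 3 come from V2 -> 0b1010.
  assert(blendImmFromMask({0, 5, 2, 7}) == 0xA);
  return 0;
}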
@@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, } LLVM_FALLTHROUGH; } - case MVT::v16i8: - case MVT::v32i8: { - assert((VT.is128BitVector() || Subtarget.hasAVX2()) && - "256-bit byte-blends require AVX2 support!"); + case MVT::v32i8: + assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); + LLVM_FALLTHROUGH; + case MVT::v16i8: { + assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { @@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { + // Attempt to lower to a bitmask if we can. Only if not optimizing for size. + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!OptForSize) { + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return Masked; + } + + // Otherwise load an immediate into a GPR, cast to k-register, and use a + // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); @@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. -static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG, - bool ImmBlends = false) { +static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG, + bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector<int, 32> BlendMask(Mask.size(), -1); @@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. -static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; @@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. 
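The blend-and-permute decomposition above can be summarised as: blend each required element into its natural lane (index modulo width), then apply one single-input permute; it fails when two mask entries compete for the same natural lane. A standalone sketch of that decomposition (function and variable names are invented for the example):

// Illustrative sketch only, not part of the patch.
#include <cassert>
#include <vector>

bool decomposeBlendPermute(const std::vector<int> &Mask,
                           std::vector<int> &BlendMask,
                           std::vector<int> &PermuteMask) {
  const int Size = static_cast<int>(Mask.size());
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue; // undef stays undef
    int Lane = Mask[i] % Size;
    if (BlendMask[Lane] >= 0 && BlendMask[Lane] != Mask[i])
      return false; // both inputs want this lane - cannot blend first
    BlendMask[Lane] = Mask[i];
    PermuteMask[i] = Lane;
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  assert(decomposeBlendPermute({6, 1, 4, 3}, Blend, Permute));
  assert(Blend == (std::vector<int>{4, 1, 6, 3}));
  assert(Permute == (std::vector<int>{2, 1, 0, 3}));
  assert(!decomposeBlendPermute({0, 4, 2, 3}, Blend, Permute));
  return 0;
}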
-static SDValue lowerVectorShuffleAsByteRotateAndPermute( +static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || @@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute( /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( +static SDValue lowerShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and @@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, VT, V1, V2, Mask, DAG, true)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG, true)) return BlendPerm; - if (SDValue UnpackPerm = - lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, + DAG)) return UnpackPerm; - if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. - if (SDValue BlendPerm = - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG)) return BlendPerm; } @@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. -static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, - ArrayRef<int> Mask) { +static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask) { +static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { // Don't accept any shuffles with zero elements. 
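The rotation matcher above checks that every defined mask element is displaced from its position by one common amount. A simplified single-source sketch of that test (the in-tree matcher additionally pairs two sources as the low/high halves of the rotation):

// Illustrative sketch only, not part of the patch: return the common
// rotation amount R such that Mask[i] == (i + R) % N for every defined
// element, or -1 if the mask is not a rotation.
#include <cassert>
#include <vector>

int matchRotation(const std::vector<int> &Mask) {
  const int N = static_cast<int>(Mask.size());
  int Rotation = -1;
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue; // undef matches anything
    int R = (Mask[i] - i + N) % N;
    if (Rotation < 0)
      Rotation = R;
    else if (Rotation != R)
      return -1; // inconsistent displacement - not a rotation
  }
  return Rotation; // -1 if no defined elements or no match
}

int main() {
  assert(matchRotation({3, -1, 5, 6, 7, 0, 1, 2}) == 3);
  assert(matchRotation({1, 0, 3, 2}) == -1);
  return 0;
}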
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; @@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; - int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); + int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; @@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, return Rotation * Scale; } -static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; - int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); + int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); @@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); @@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; - int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); + int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); @@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, DAG.getConstant(Rotation, DL, MVT::i8)); } +/// Try to lower a vector shuffle as a byte shift sequence. +static SDValue lowerVectorShuffleAsByteShiftMask( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + assert(VT.is128BitVector() && "Only 128-bit vectors supported"); + + // We need a shuffle that has zeros at one/both ends and a sequential + // shuffle from one source within. + unsigned ZeroLo = Zeroable.countTrailingOnes(); + unsigned ZeroHi = Zeroable.countLeadingOnes(); + if (!ZeroLo && !ZeroHi) + return SDValue(); + + unsigned NumElts = Mask.size(); + unsigned Len = NumElts - (ZeroLo + ZeroHi); + if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) + return SDValue(); + + unsigned Scale = VT.getScalarSizeInBits() / 8; + ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len); + if (!isUndefOrInRange(StubMask, 0, NumElts) && + !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) + return SDValue(); + + SDValue Res = Mask[ZeroLo] < (int)NumElts ? 
V1 : V2; + Res = DAG.getBitcast(MVT::v16i8, Res); + + // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an + // inner sequential set of elements, possibly offset: + // 01234567 --> zzzzzz01 --> 1zzzzzzz + // 01234567 --> 4567zzzz --> zzzzz456 + // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz + if (ZeroLo == 0) { + unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); + } else if (ZeroHi == 0) { + unsigned Shift = Mask[ZeroLo] % NumElts; + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + } else if (!Subtarget.hasSSSE3()) { + // If we don't have PSHUFB then its worth avoiding an AND constant mask + // by performing 3 byte shifts. Shuffle combining can kick in above that. + // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. + unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Shift += Mask[ZeroLo] % NumElts; + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + } else + return SDValue(); + + return DAG.getBitcast(VT, Res); +} + /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] -static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, - unsigned ScalarSizeInBits, - ArrayRef<int> Mask, int MaskOffset, - const APInt &Zeroable, - const X86Subtarget &Subtarget) { +static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, + unsigned ScalarSizeInBits, ArrayRef<int> Mask, + int MaskOffset, const APInt &Zeroable, + const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, return -1; } -static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, unsigned Opcode; // Try to match shuffle against V1 shift. - int ShiftAmt = matchVectorShuffleAsShift( - ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. 
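The byte-shift lowering above leans on the fact that a pair of whole-register byte shifts can isolate a contiguous run of elements and zero-fill the remaining lanes without loading an AND mask. A standalone little-endian model of those shifts (shlBytes/srlBytes are invented names standing in for the PSLLDQ/PSRLDQ behaviour):

// Illustrative sketch only, not part of the patch: byte i of a
// left-shift-by-K holds input byte i-K (or zero), mirroring PSLLDQ on a
// little-endian 128-bit register.
#include <array>
#include <cassert>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

V16 shlBytes(const V16 &V, unsigned K) {
  V16 R{}; // zero-filled
  for (unsigned i = K; i < 16; ++i)
    R[i] = V[i - K];
  return R;
}

V16 srlBytes(const V16 &V, unsigned K) {
  V16 R{};
  for (unsigned i = 0; i + K < 16; ++i)
    R[i] = V[i + K];
  return R;
}

int main() {
  V16 V{};
  for (unsigned i = 0; i != 16; ++i)
    V[i] = uint8_t(i + 1); // bytes 1..16
  // Keep input bytes 4..7, moved to the bottom, zeroing everything else:
  // shift left by 8 (pushes them to the top), then right by 12.
  V16 R = srlBytes(shlBytes(V, 8), 12);
  assert(R[0] == 5 && R[3] == 8 && R[4] == 0 && R[15] == 0);
  return 0;
}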
if (ShiftAmt < 0) { - ShiftAmt = - matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), - Mask, Size, Zeroable, Subtarget); + ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, Size, Zeroable, Subtarget); V = V2; } @@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. -static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask, uint64_t &BitLen, - uint64_t &BitIdx, const APInt &Zeroable) { +static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the @@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } -static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask, uint64_t &BitLen, - uint64_t &BitIdx) { +static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { @@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); - if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), @@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. 
-static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( +static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); @@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. + // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. @@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; - unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); + unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), @@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || - !SafeOffset(Offset + 1)) + if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; @@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. -static SDValue lowerVectorShuffleAsZeroOrAnyExtend( +static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (Offset != 0 && Matches < 2) return SDValue(); - return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); + return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt, + InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. -static SDValue lowerVectorShuffleAsElementInsertion( +static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion( /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. 
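The zero-extend matcher above accepts a mask where, for some scale, every scaled lane reads consecutive source elements and the in-between lanes are zeroable or undef. A simplified standalone version of that test (it folds undef and zeroable together and ignores the any-extend variant):

// Illustrative sketch only, not part of the patch: -1 stands for a
// zero/undef filler lane.
#include <cassert>
#include <vector>

bool isZExtMask(const std::vector<int> &Mask, int Scale, int Offset) {
  const int N = static_cast<int>(Mask.size());
  for (int i = 0; i != N; ++i) {
    if (i % Scale != 0) {
      if (Mask[i] != -1) // must be zero/undef filler
        return false;
    } else if (Mask[i] != -1 && Mask[i] != Offset + i / Scale) {
      return false; // must pick the next consecutive source element
    }
  }
  return true;
}

int main() {
  // v8i16 zext of the low 4 source elements, viewed as i16 lanes.
  assert(isZExtMask({0, -1, 1, -1, 2, -1, 3, -1}, 2, 0));
  assert(!isZExtMask({0, -1, 2, -1, 4, -1, 6, -1}, 2, 0));
  return 0;
}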
-static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, - SDValue V0, int BroadcastIdx, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); @@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } +/// Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. + if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + +/// If we are extracting two 128-bit halves of a vector and shuffling the +/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a +/// multi-shuffle lowering. +static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, + SDValue N1, ArrayRef<int> Mask, + SelectionDAG &DAG) { + EVT VT = N0.getValueType(); + assert((VT.is128BitVector() && + (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && + "VPERM* family of shuffles requires 32-bit or 64-bit elements"); + + // Check that both sources are extracts of the same source vector. + if (!N0.hasOneUse() || !N1.hasOneUse() || + N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N0.getOperand(0) != N1.getOperand(0)) + return SDValue(); + + SDValue WideVec = N0.getOperand(0); + EVT WideVT = WideVec.getValueType(); + if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) || + !isa<ConstantSDNode>(N1.getOperand(1))) + return SDValue(); + + // Match extracts of each half of the wide source vector. Commute the shuffle + // if the extract of the low half is N1. + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 4> NewMask(Mask.begin(), Mask.end()); + const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); + const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); + if (ExtIndex1 == 0 && ExtIndex0 == NumElts) + ShuffleVectorSDNode::commuteMask(NewMask); + else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) + return SDValue(); + + // Final bailout: if the mask is simple, we are better off using an extract + // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps + // because that avoids a constant load from memory. + if (NumElts == 4 && + (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) + return SDValue(); + + // Extend the shuffle mask with undef elements. 
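The single-SHUFPS test above reduces to a pure check on the 4-element mask: the low pair and the high pair of the result must each draw from only one of the two sources. A minimal standalone model of that check (plain C++ over a std::array, not the DAG routine itself):

#include <array>
#include <cassert>

// Mask elements are -1 (undef) or 0..7; 0..3 name the first source and 4..7
// the second. SHUFPS picks its two low result elements from one source and
// its two high result elements from one source.
static bool isSingleShufpsMask(const std::array<int, 4> &Mask) {
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  assert(isSingleShufpsMask({0, 1, 4, 5}));  // low pair from A, high pair from B
  assert(!isSingleShufpsMask({0, 4, 1, 5})); // both pairs mix sources
}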
+ NewMask.append(NumElts, -1); + + // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 + SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), + NewMask); + // This is free: ymm -> xmm. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, + DAG.getIntPtrConstant(0, DL)); +} + /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) @@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); + unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; @@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { - // Peek through bitcasts as long as BroadcastIdx can be adjusted. 
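The rewrite noted above (shuf (extract X, 0), (extract X, NumElts), M --> extract (shuf X, undef, M'), 0) works because the narrow two-operand mask already uses the element numbering of the wide source X, so M' is just M padded with undef entries, after commuting when the low-half extract happens to be the second operand. A small illustrative model that ignores the profitability bail-outs in the patch:

#include <cassert>
#include <vector>

// Indices 0..NumElts-1 name the low-half extract and NumElts..2*NumElts-1 the
// high-half extract; both map directly onto elements of the wide source.
static std::vector<int> widenExtractShuffleMask(std::vector<int> Mask,
                                                bool LowHalfIsSecondOperand) {
  int NumElts = (int)Mask.size();
  if (LowHalfIsSecondOperand)           // commute the two-operand mask
    for (int &M : Mask)
      if (M >= 0)
        M = (M < NumElts) ? M + NumElts : M - NumElts;
  Mask.insert(Mask.end(), NumElts, -1); // upper half of the wide result is undef
  return Mask;
}

int main() {
  // shuf (extract X, 0), (extract X, 4), <0,5,2,7>
  //   --> extract (shuf X, undef, <0,5,2,7,u,u,u,u>), 0
  std::vector<int> M = widenExtractShuffleMask({0, 5, 2, 7}, false);
  assert((M == std::vector<int>{0, 5, 2, 7, -1, -1, -1, -1}));
}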
- SDValue VSrc = V.getOperand(0); - unsigned NumEltBits = V.getScalarValueSizeInBits(); - unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); - if ((NumEltBits % NumSrcBits) == 0) - BroadcastIdx *= (NumEltBits / NumSrcBits); - else if ((NumSrcBits % NumEltBits) == 0 && - (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) - BroadcastIdx /= (NumSrcBits / NumEltBits); - else - break; - V = VSrc; + V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { - int OperandSize = - V.getOperand(0).getSimpleValueType().getVectorNumElements(); - V = V.getOperand(BroadcastIdx / OperandSize); - BroadcastIdx %= OperandSize; + int OpBitWidth = V.getOperand(0).getValueSizeInBits(); + int OpIdx = BitOffset / OpBitWidth; + V = V.getOperand(OpIdx); + BitOffset %= OpBitWidth; continue; } case ISD::INSERT_SUBVECTOR: { @@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (!ConstantIdx) break; - int BeginIdx = (int)ConstantIdx->getZExtValue(); - int EndIdx = - BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); - if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { - BroadcastIdx -= BeginIdx; + int EltBitWidth = VOuter.getScalarValueSizeInBits(); + int Idx = (int)ConstantIdx->getZExtValue(); + int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); + int BeginOffset = Idx * EltBitWidth; + int EndOffset = BeginOffset + NumSubElts * EltBitWidth; + if (BeginOffset <= BitOffset && BitOffset < EndOffset) { + BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; @@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } break; } + assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); + BroadcastIdx = BitOffset / NumEltBits; - // Ensure the source vector and BroadcastIdx are for a suitable type. - if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { - unsigned NumEltBits = VT.getScalarSizeInBits(); - unsigned NumSrcBits = V.getScalarValueSizeInBits(); - if ((NumSrcBits % NumEltBits) == 0) - BroadcastIdx *= (NumSrcBits / NumEltBits); - else if ((NumEltBits % NumSrcBits) == 0 && - (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) - BroadcastIdx /= (NumEltBits / NumSrcBits); - else - return SDValue(); - - unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; - MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); - V = DAG.getBitcast(SrcVT, V); - } + // Do we need to bitcast the source to retrieve the original broadcast index? + bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. - // First, look through bitcast: if the original value has a larger element - // type than the shuffle, the broadcast element is in essence truncated. - // Make that explicit to ease folding. - if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) - if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( - DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + // If the original value has a larger element type than the shuffle, the + // broadcast element is in essence truncated. Make that explicit to ease + // folding. + if (BitCastSrc && VT.isInteger()) + if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( + DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; - // Peek through any bitcast (only useful for loads). 
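The broadcast search above now tracks the broadcast element as a bit offset into the whole value rather than as an element index, so stepping through bitcasts, concatenations and subvector inserts becomes plain integer division. A tiny model of the CONCAT_VECTORS step, with hypothetical names:

#include <cassert>

struct ConcatStep {
  int OpIdx;         // which concat operand holds the broadcast element
  int BitOffsetInOp; // bit offset of that element within the operand
};

static ConcatStep stepIntoConcat(int BitOffset, int OpBitWidth) {
  return {BitOffset / OpBitWidth, BitOffset % OpBitWidth};
}

int main() {
  // Broadcasting element 5 of a v8i32 built as a concat of two v4i32 halves:
  // bit offset 5 * 32 = 160 lands in operand 1 at bit offset 32 (element 1).
  ConcatStep S = stepIntoConcat(5 * 32, 128);
  assert(S.OpIdx == 1 && S.BitOffsetInOp == 32);
}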
- SDValue BC = peekThroughBitcasts(V); - // Also check the simpler case, where we can directly reuse the scalar. - if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || - (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + if (!BitCastSrc && + ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. - LoadSDNode *Ld = cast<LoadSDNode>(BC); + LoadSDNode *Ld = cast<LoadSDNode>(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( @@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); - } else if (BroadcastIdx != 0) { + } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. @@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. - unsigned EltSize = VT.getScalarSizeInBits(); - if (((BroadcastIdx * EltSize) % 128) != 0) + if ((BitOffset % 128) != 0) return SDValue(); - // The shuffle input might have been a bitcast we looked through; look at - // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll - // later bitcast it to BroadcastVT. - assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && - "Unexpected vector element size"); + assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && + "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); - V = extract128BitVector(V, BroadcastIdx, DAG, DL); + unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); + V = extract128BitVector(V, ExtractIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) @@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. 
- MVT SrcVT = V.getSimpleValueType(); - if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { - assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { + assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); - if (SrcVT.isVector()) { - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); + MVT ExtVT; + if (V.getValueType().isVector()) { + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { - SrcVT = BroadcastVT.getScalarType(); + ExtVT = BroadcastVT.getScalarType(); } - V = DAG.getBitcast(SrcVT, V); + V = DAG.getBitcast(ExtVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { + if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); @@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. - if (SrcVT.getSizeInBits() > 128) { - MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), - 128 / SrcVT.getScalarSizeInBits()); + if (V.getValueSizeInBits() > 128) { + MVT ExtVT = V.getSimpleValueType().getScalarType(); + ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } @@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. -static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, - unsigned &InsertPSMask, - const APInt &Zeroable, - ArrayRef<int> Mask, - SelectionDAG &DAG) { +static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, + unsigned &InsertPSMask, + const APInt &Zeroable, + ArrayRef<int> Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef<int> Mask, const APInt &Zeroable, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; - if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) + if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. 
@@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsPermuteAndUnpack( +static SDValue lowerShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && @@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack( /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. -static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the @@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); @@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. -static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. @@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// Test whether this can be lowered with a single SHUFPS instruction. -/// -/// This is used to disable more specialized lowerings when the shufps lowering -/// will happen to be efficient. -static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { - // This routine only handles 128-bit shufps. - assert(Mask.size() == 4 && "Unsupported mask size!"); - assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); - assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); - assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); - assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); - - // To lower with a single SHUFPS we need to have the low half and high half - // each requiring a single input. - if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) - return false; - if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) - return false; - - return true; -} - /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. -static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; @@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. 
-static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. @@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. - if (SDValue V = - lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, MVT::v4f32, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, + V2, Mask, DAG)) return BlendPerm; } @@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. 
-static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
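Both the v4f32 and v4i32 single-input paths above end in a single instruction whose 8-bit immediate is built from the 4-element mask, two bits per destination element. A small model of that encoding; undef elements are arbitrarily mapped to 0 here, and the in-tree helper may canonicalize differently:

#include <cassert>

// imm8 layout for PSHUFD/SHUFPS-style shuffles: bits [2i+1:2i] select the
// source element for destination element i.
static unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef as element 0
    Imm |= (unsigned)(M & 3) << (2 * i);
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  assert(shuffleImm8(Identity) == 0xE4); // 11'10'01'00
  assert(shuffleImm8(Reverse) == 0x1B);  // 00'01'10'11
}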
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } @@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Unpack; } @@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. -static SDValue lowerV8I16GeneralSingleInputVectorShuffle( +static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); @@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); - int NumLToL = - std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; - int NumLToH = - std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); @@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. - int ADWord, BDWord; + int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; @@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. 
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. -static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( +static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && @@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. -static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, + Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. 
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end()); - return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, - MutableMask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, + Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && @@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, "shuffles."); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; + // Try to use byte shift instructions to mask. + if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return V; + // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. 
if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG, V1InUse, V2InUse); + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even @@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, return 0; } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); @@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. -static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use a zext lowering. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. 
if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); @@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. @@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; } - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + + // Try to use byte shift instructions to mask. + if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool V1InUse = false; bool V2InUse = false; - SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( + SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, @@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some @@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. 
- if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } @@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; - if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) - return BitBlend; + if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for @@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together @@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. -static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: - return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: - return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: - return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: - return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: - return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: - return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); @@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. 
-static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); @@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. -static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); @@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, return true; }; if (DoBothBroadcast()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, + DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as @@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// -/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes, +/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. 
-static SDValue lowerVectorShuffleAsLanePermuteAndPermute( +static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); @@ -13884,7 +14401,6 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( int NumEltsPerLane = NumElts / NumLanes; SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef); - SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef); SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { @@ -13899,10 +14415,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( return SDValue(); SrcLaneMask[DstLane] = SrcLane; - LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane); PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); } + // Make sure we set all elements of the lane mask, to avoid undef propagation. + SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef); + for (int DstLane = 0; DstLane != NumLanes; ++DstLane) { + int SrcLane = SrcLaneMask[DstLane]; + if (0 <= SrcLane) + for (int j = 0; j != NumEltsPerLane; ++j) { + LaneMask[(DstLane * NumEltsPerLane) + j] = + (SrcLane * NumEltsPerLane) + j; + } + } + // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. // TODO - isShuffleMaskInputInPlace could be extended to something like this. @@ -13931,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. -static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleAsLanePermuteAndBlend( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); @@ -13950,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && @@ -13981,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, } /// Handle lowering 2-lane 128-bit shuffles. 
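The decomposition above only applies when every destination 128-bit lane reads from a single source lane; the shuffle is then split into a whole-lane move followed by an in-lane permute, and, as the fix in this hunk notes, the lane mask is filled for every element of a used lane so nothing is left undef by accident. A standalone model of that bookkeeping:

#include <cassert>
#include <vector>

// Returns false if some destination lane needs more than one source lane.
static bool decomposeLanePermute(const std::vector<int> &Mask, int EltsPerLane,
                                 std::vector<int> &LaneMask,
                                 std::vector<int> &PermMask) {
  int NumElts = (int)Mask.size();
  int NumLanes = NumElts / EltsPerLane;
  std::vector<int> SrcLane(NumLanes, -1);
  PermMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int DstLane = i / EltsPerLane;
    int FromLane = M / EltsPerLane;
    if (SrcLane[DstLane] >= 0 && SrcLane[DstLane] != FromLane)
      return false;                    // this lane would need two source lanes
    SrcLane[DstLane] = FromLane;
    PermMask[i] = DstLane * EltsPerLane + (M % EltsPerLane);
  }
  // Fill every element of each used lane so the lane mask has no stray undefs.
  LaneMask.assign(NumElts, -1);
  for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
    if (SrcLane[DstLane] < 0)
      continue;
    for (int j = 0; j != EltsPerLane; ++j)
      LaneMask[DstLane * EltsPerLane + j] = SrcLane[DstLane] * EltsPerLane + j;
  }
  return true;
}

int main() {
  // v8 mask <5,4,6,7,1,0,2,3>: lane 0 reads only lane 1, lane 1 only lane 0.
  std::vector<int> LaneMask, PermMask;
  bool OK = decomposeLanePermute({5, 4, 6, 7, 1, 0, 2, 3}, 4, LaneMask, PermMask);
  assert(OK);
  // First move whole lanes into place, then permute within each lane.
  assert((LaneMask == std::vector<int>{4, 5, 6, 7, 0, 1, 2, 3}));
  assert((PermMask == std::vector<int>{1, 0, 2, 3, 5, 4, 6, 7}));
}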
-static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); @@ -14012,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask @@ -14084,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. -/// -/// FIXME: This should be generalized to 512-bit shuffles. -static SDValue lowerVectorShuffleByMerging128BitLanes( +static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); @@ -14095,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return SDValue(); int Size = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; int LaneSize = 128 / VT.getScalarSizeInBits(); - int NumLanes = Size / LaneSize; - assert(NumLanes == 2 && "Only handles 256-bit shuffles."); - SmallVector<int, 16> RepeatMask(LaneSize, -1); - int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } }; + SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. @@ -14111,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; - // Determine which of the 4 possible input lanes (2 from each source) + // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. @@ -14250,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } -/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. -/// This allows for fast cases such as subvector extraction/insertion -/// or shuffling smaller vector types which can lower more efficiently. 
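lowerV2X128Shuffle above ultimately selects whole 128-bit halves of the two operands, which VPERM2X128 encodes in a single immediate. A minimal sketch of that encoding follows (hardware convention only; the helper name is the editor's, not LLVM's):

#include <cstdio>

// Lane indices: 0 = low half of V1, 1 = high half of V1, 2 = low half of V2,
// 3 = high half of V2; -1 stands for a zeroed half.
static unsigned vperm2x128Imm(int LoLane, int HiLane) {
  unsigned Imm = 0;
  if (LoLane < 0)
    Imm |= 0x08;                           // imm[3] zeroes the low 128 bits
  else
    Imm |= static_cast<unsigned>(LoLane) & 0x3;
  if (HiLane < 0)
    Imm |= 0x80;                           // imm[7] zeroes the high 128 bits
  else
    Imm |= (static_cast<unsigned>(HiLane) & 0x3) << 4;
  return Imm;
}

int main() {
  // Result = { low half of V2, high half of V1 }.
  std::printf("imm = 0x%02x\n", vperm2x128Imm(/*LoLane=*/2, /*HiLane=*/1));
}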
-static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert((VT.is256BitVector() || VT.is512BitVector()) && - "Expected 256-bit or 512-bit vector"); - - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - - bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); - bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); - if (!UndefLower && !UndefUpper) - return SDValue(); - - // Upper half is undef and lower half is whole upper subvector. - // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (UndefUpper && - isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(HalfNumElts, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(0, DL)); - } - - // Lower half is undef and upper half is whole lower subvector. - // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - if (UndefLower && - isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(HalfNumElts, DL)); - } +/// If the input shuffle mask results in a vector that is undefined in all upper +/// or lower half elements and that mask accesses only 2 halves of the +/// shuffle's operands, return true. A mask of half the width with mask indexes +/// adjusted to access the extracted halves of the original shuffle operands is +/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or +/// lower half of each input operand is accessed. +static bool +getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask, + int &HalfIdx1, int &HalfIdx2) { + assert((Mask.size() == HalfMask.size() * 2) && + "Expected input mask to be twice as long as output"); + + // Exactly one half of the result must be undef to allow narrowing. + bool UndefLower = isUndefLowerHalf(Mask); + bool UndefUpper = isUndefUpperHalf(Mask); + if (UndefLower == UndefUpper) + return false; - // If the shuffle only uses two of the four halves of the input operands, - // then extract them and perform the 'half' shuffle at half width. - // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> - int HalfIdx1 = -1, HalfIdx2 = -1; - SmallVector<int, 8> HalfMask(HalfNumElts); - unsigned Offset = UndefLower ? HalfNumElts : 0; + unsigned HalfNumElts = HalfMask.size(); + unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; + HalfIdx1 = -1; + HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { - int M = Mask[i + Offset]; + int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; @@ -14324,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, } // Too many half vectors referenced. - return SDValue(); + return false; } - assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); - - // Only shuffle the halves of the inputs when useful. 
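The new getHalfShuffleMask helper introduced above reduces a mask with one fully-undef half to a half-width mask plus at most two referenced source halves. A simplified standalone model (editor's sketch, not the LLVM helper; the undef handling is condensed):

#include <cstdio>
#include <vector>

// Half indices: 0 = low half of V1, 1 = high half of V1,
//               2 = low half of V2, 3 = high half of V2.
static bool halfShuffleMask(const std::vector<int> &Mask,
                            std::vector<int> &HalfMask, int &HalfIdx1,
                            int &HalfIdx2, bool &UndefLower) {
  const int NumElts = static_cast<int>(Mask.size());
  const int HalfNumElts = NumElts / 2;
  auto isUndefRange = [&](int Lo) {
    for (int i = Lo; i < Lo + HalfNumElts; ++i)
      if (Mask[i] >= 0)
        return false;
    return true;
  };
  UndefLower = isUndefRange(0);
  bool UndefUpper = isUndefRange(HalfNumElts);
  if (UndefLower == UndefUpper)            // exactly one half must be undef
    return false;

  HalfMask.assign(HalfNumElts, -1);
  HalfIdx1 = HalfIdx2 = -1;
  const int Offset = UndefLower ? HalfNumElts : 0;
  for (int i = 0; i < HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts;         // which of the four source halves
    int HalfElt = M % HalfNumElts;         // index within that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfIdx1 = HalfIdx;
      HalfMask[i] = HalfElt;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfIdx2 = HalfIdx;
      HalfMask[i] = HalfElt + HalfNumElts;
    } else {
      return false;                        // too many half vectors referenced
    }
  }
  return true;
}

int main() {
  // v8: upper half undef, lower half pulls from V1's high half and V2's low half.
  std::vector<int> Mask = {4, 5, 8, 9, -1, -1, -1, -1}, Half;
  int H1, H2;
  bool UndefLower;
  if (halfShuffleMask(Mask, Half, H1, H2, UndefLower))
    std::printf("halves %d and %d, lower-undef=%d\n", H1, H2, UndefLower);
}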
- int NumLowerHalves = - (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); - int NumUpperHalves = - (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); - - // uuuuXXXX - don't extract uppers just to insert again. - if (UndefLower && NumUpperHalves != 0) - return SDValue(); - // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. - if (UndefUpper && NumUpperHalves == 2) - return SDValue(); + return true; +} - // AVX2 - XXXXuuuu - always extract lowers. - if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { - // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. - if (VT == MVT::v4f64 || VT == MVT::v4i64) - return SDValue(); - // AVX2 supports variable 32-bit element cross-lane shuffles. - if (VT == MVT::v8f32 || VT == MVT::v8i32) { - // XXXXuuuu - don't extract lowers and uppers. - if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) - return SDValue(); - } - } +/// Given the output values from getHalfShuffleMask(), create a half width +/// shuffle of extracted vectors followed by an insert back to full width. +static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef<int> HalfMask, int HalfIdx1, + int HalfIdx2, bool UndefLower, + SelectionDAG &DAG) { + assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); + assert(V1.getValueType().isSimple() && "Expecting only simple types"); - // AVX512 - XXXXuuuu - always extract lowers. - if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) - return SDValue(); + MVT VT = V1.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - auto GetHalfVector = [&](int HalfIdx) { + auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); @@ -14368,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, DAG.getIntPtrConstant(HalfIdx, DL)); }; - SDValue Half1 = GetHalfVector(HalfIdx1); - SDValue Half2 = GetHalfVector(HalfIdx2); + // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset + SDValue Half1 = getHalfVector(HalfIdx1); + SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } +/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT.is256BitVector() || VT.is512BitVector()) && + "Expected 256-bit or 512-bit vector"); + + bool UndefLower = isUndefLowerHalf(Mask); + if (!UndefLower && !isUndefUpperHalf(Mask)) + return SDValue(); + + assert((!UndefLower || !isUndefUpperHalf(Mask)) && + "Completely undef shuffle mask should have been simplified already"); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. 
vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + if (!UndefLower && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + int HalfIdx1, HalfIdx2; + SmallVector<int, 8> HalfMask(HalfNumElts); + if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) + return SDValue(); + + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + // Only shuffle the halves of the inputs when useful. + unsigned NumLowerHalves = + (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); + unsigned NumUpperHalves = + (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); + assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); + + // Determine the larger pattern of undef/halves, then decide if it's worth + // splitting the shuffle based on subtarget capabilities and types. + unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); + if (!UndefLower) { + // XXXXuuuu: no insert is needed. + // Always extract lowers when setting lower - these are all free subreg ops. + if (NumUpperHalves == 0) + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + + if (NumUpperHalves == 1) { + // AVX2 has efficient 32/64-bit element cross-lane shuffles. + if (Subtarget.hasAVX2()) { + // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. + if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && + !is128BitUnpackShuffleMask(HalfMask) && + (!isSingleSHUFPSMask(HalfMask) || + Subtarget.hasFastVariableShuffle())) + return SDValue(); + // If this is a unary shuffle (assume that the 2nd operand is + // canonicalized to undef), then we can use vpermpd. Otherwise, we + // are better off extracting the upper half of 1 operand and using a + // narrow shuffle. + if (EltWidth == 64 && V2.isUndef()) + return SDValue(); + } + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. + if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Extract + narrow shuffle is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // Don't extract both uppers, instead shuffle and then extract. + assert(NumUpperHalves == 2 && "Half vector count went wrong"); + return SDValue(); + } + + // UndefLower - uuuuXXXX: an insert to high half is required if we split this. + if (NumUpperHalves == 0) { + // AVX2 has efficient 64-bit element cross-lane shuffles. + // TODO: Refine to account for unary shuffle, splat, and other masks? + if (Subtarget.hasAVX2() && EltWidth == 64) + return SDValue(); + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. 
+ if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Narrow shuffle + insert is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. + return SDValue(); +} + /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -14560,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( SubLaneMask); } -static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, - ArrayRef<int> Mask) { +static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, + unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && @@ -14597,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, @@ -14615,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. @@ -14659,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( - DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, + Mask, DAG, Subtarget)) return V; // Otherwise, fall back. 
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, + Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14694,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) - return Result; + return V; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. 
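The v4f64 path above asks whether a blend "exactly fits that of SHUFPD". A standalone sketch of the core test, specialised to the non-commuted case (editor's illustration, not matchShuffleWithSHUFPD, which also tries the commuted and unary forms):

#include <cstdio>
#include <vector>

// Even result elements must come from V1 and odd ones from V2, each taken
// from the matching 128-bit lane; bit i of the immediate picks the low or
// high double within that lane.
static bool matchShufpdV4F64(const std::vector<int> &Mask, unsigned &Imm) {
  const int NumElts = 4;
  Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                  // undef matches anything
    int Base = (i % 2 == 0) ? i : NumElts + i - 1;
    if (M != Base && M != Base + 1)
      return false;                              // wrong source or wrong lane
    Imm |= static_cast<unsigned>(M % 2) << i;    // select low/high double
  }
  return true;
}

int main() {
  std::vector<int> Mask = {1, 4, 2, 7};          // v4f64 blend of V1 and V2
  unsigned Imm;
  if (matchShufpdV4F64(Mask, Imm))
    std::printf("vshufpd imm = 0x%x\n", Imm);    // prints 0x9
}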
-static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { @@ -14763,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14800,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // instruction so skip this pattern. 
if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more @@ -14849,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -14875,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. 
if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. -static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -14926,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split @@ -14935,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more @@ -14961,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15006,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. 
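Several of the 256-bit paths above first check whether the mask repeats in every 128-bit lane, in which case a single VPERMILPS/PSHUFD immediate suffices (two bits per element). A standalone model of the unary case (editor's sketch; the names are not LLVM's):

#include <cstdio>
#include <vector>

// Succeeds when every lane applies the same permutation of its own lane.
static bool repeated128BitLaneMask(const std::vector<int> &Mask, int LaneSize,
                                   std::vector<int> &Repeated) {
  Repeated.assign(LaneSize, -1);
  for (int i = 0, e = static_cast<int>(Mask.size()); i < e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M / LaneSize != i / LaneSize)
      return false;                        // crosses a 128-bit lane
    int &R = Repeated[i % LaneSize];
    if (R < 0)
      R = M % LaneSize;                    // first definition wins
    else if (R != M % LaneSize)
      return false;                        // lanes disagree
  }
  return true;
}

static unsigned imm8ForV4Mask(const std::vector<int> &R) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<unsigned>(R[i] < 0 ? 0 : R[i]) << (2 * i);
  return Imm;
}

int main() {
  // v8f32 <3,2,1,0, 7,6,5,4>: both lanes reversed -> one vpermilps.
  std::vector<int> Mask = {3, 2, 1, 0, 7, 6, 5, 4}, Rep;
  if (repeated128BitLaneMask(Mask, /*LaneSize=*/4, Rep))
    std::printf("vpermilps imm = 0x%02x\n", imm8ForV4Mask(Rep)); // 0x1b
}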
-static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15039,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15082,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i16 // element types. 
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, - Mask, DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector<int, 8> RepeatedMask; @@ -15095,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. - return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. -static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15141,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15183,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, + Subtarget); } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. 
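The i16/i8 lowerings above branch on whether the mask crosses 128-bit lanes, since there are no general cross-lane word or byte shuffles without AVX-512 extensions. The test itself is small; a standalone sketch with the editor's naming:

#include <cstdio>
#include <vector>

// A shuffle "crosses lanes" when some element is taken from a different
// 128-bit lane of its source operand than the lane it lands in.
static bool crosses128BitLanes(const std::vector<int> &Mask, int EltBits) {
  const int Size = static_cast<int>(Mask.size());
  const int LaneSize = 128 / EltBits;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // v16i16: element 0 pulls from the upper lane of V1 -> lane-crossing.
  std::vector<int> Mask(16, -1);
  Mask[0] = 8;
  std::printf("crosses lanes: %s\n", crosses128BitLanes(Mask, 16) ? "yes" : "no");
}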
@@ -15220,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = - lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we @@ -15251,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. - if (SDValue V = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), @@ -15268,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, switch (VT.SimpleTy) { case MVT::v4f64: - return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: - return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: - return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: - return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: - return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: - return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); @@ -15286,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } /// Try to 
lower a vector shuffle as a 128-bit shuffles. -static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); @@ -15388,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, } /// Handle lowering of 8-lane 64-bit floating point shuffles. -static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15419,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2, - Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. -static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15471,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) - return Unpck; + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the @@ -15492,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; - return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. -static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15530,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable, - V1, V2, Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. 
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. -static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15578,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; @@ -15595,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. @@ -15621,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. 
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. -static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15650,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (V2.isUndef()) { @@ -15675,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; - return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. 
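PSHUFB appears as a fallback throughout the integer paths above; it can permute bytes arbitrarily, but only within each 128-bit lane. A sketch of the control-byte encoding for the single-input case (an illustration of the instruction's encoding, not lowerShuffleWithPSHUFB itself):

#include <cstdint>
#include <cstdio>
#include <vector>

// Each control byte is the source byte's offset within its own lane;
// 0x80 zeroes the destination byte (used here for undef as well).
static bool pshufbControl(const std::vector<int> &ByteMask,
                          std::vector<uint8_t> &Ctrl) {
  const int Size = static_cast<int>(ByteMask.size());
  Ctrl.assign(Size, 0x80);
  for (int i = 0; i < Size; ++i) {
    int M = ByteMask[i];
    if (M < 0)
      continue;                            // leave as a zeroing byte
    if (M / 16 != i / 16)
      return false;                        // PSHUFB cannot cross 128-bit lanes
    Ctrl[i] = static_cast<uint8_t>(M % 16);
  }
  return true;
}

int main() {
  // v16i8 byte reverse within the (single) lane.
  std::vector<int> Mask;
  for (int i = 15; i >= 0; --i)
    Mask.push_back(i);
  std::vector<uint8_t> Ctrl;
  if (pshufbControl(Mask, Ctrl))
    std::printf("ctrl[0]=%u ctrl[15]=%u\n",
                static_cast<unsigned>(Ctrl[0]),
                static_cast<unsigned>(Ctrl[15])); // 15 and 0
}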
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); @@ -15705,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) - return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. @@ -15743,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (!V2.isUndef()) + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. 
@@ -15756,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); @@ -15770,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = - lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. - if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for @@ -15790,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: - return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: - return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: - return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); @@ -15809,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified -// version of matchVectorShuffleAsShift. +// version of matchShuffleAsShift. 
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); @@ -15844,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask, // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. -static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); @@ -16037,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. - if (V2IsUndef) - for (int M : Mask) - if (M >= NumElements) { - SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); - for (int &M : NewMask) - if (M >= NumElements) - M = -1; - return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); - } + if (V2IsUndef && + any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); + } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; @@ -16083,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() @@ -16122,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); - if (SDValue V = - lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. 
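The rewritten canonicalization in lowerVectorShuffle above turns mask indices that point into an undef V2 into explicit undef (-1) lanes before rebuilding the shuffle. The transform itself is trivial; here it is as a standalone sketch over a plain index vector (the helper name is illustrative, not an LLVM API):

// Sketch: when the second shuffle input is undef, any index selecting
// from it is replaced by the sentinel -1.
#include <algorithm>
#include <cstdio>
#include <vector>

static bool canonicalizeUndefV2Mask(std::vector<int> &Mask, int NumElements,
                                    bool V2IsUndef) {
  if (!V2IsUndef ||
      std::none_of(Mask.begin(), Mask.end(),
                   [NumElements](int M) { return M >= NumElements; }))
    return false;   // nothing to do
  for (int &M : Mask)
    if (M >= NumElements)
      M = -1;        // undef lane
  return true;       // caller would rebuild the shuffle with the new mask
}

int main() {
  std::vector<int> Mask = {0, 9, 2, 11};   // 4-element shuffle, V2 undef
  canonicalizeUndefV2Mask(Mask, 4, /*V2IsUndef=*/true);
  for (int M : Mask) printf("%d ", M);     // prints: 0 -1 2 -1
  printf("\n");
  return 0;
}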
if (VT.is128BitVector()) - return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) - return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) - return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) - return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -16401,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getConstant(IdxVal, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); @@ -16527,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (!isa<ConstantSDNode>(N2)) + + auto *N2C = dyn_cast<ConstantSDNode>(N2); + if (!N2C || N2C->getAPIntValue().uge(NumElts)) return SDValue(); - auto *N2C = cast<ConstantSDNode>(N2); - unsigned IdxVal = N2C->getZExtValue(); + uint64_t IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); @@ -16575,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(IdxIn128, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); + // This will be just movd/movq/movss/movsd. + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && + (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || + EltVT == MVT::i64)) { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + } + // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { @@ -16613,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); + bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -16663,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, // Insert the 128-bit vector. 
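The EXTRACT/INSERT_VECTOR_ELT paths above reduce an element index on a 256/512-bit vector to a 128-bit chunk number plus an index inside that chunk, and the new INSERT special case notes that inserting element 0 into an all-zeros vector is just MOVD/MOVQ/MOVSS/MOVSD. A standalone sketch of both observations (helper names are illustrative; requires SSE2):

#include <immintrin.h>
#include <cstdio>

struct ChunkIdx { unsigned Chunk, IdxInChunk; };

static ChunkIdx splitIndex(unsigned IdxVal, unsigned EltBits) {
  unsigned ElemsPerChunk = 128 / EltBits;        // always a power of two
  return {IdxVal / ElemsPerChunk, IdxVal & (ElemsPerChunk - 1)};
}

int main() {
  // Element 11 of a v16i32 (512-bit) lives in 128-bit chunk 2 at position 3.
  ChunkIdx C = splitIndex(11, 32);
  printf("chunk %u, index %u\n", C.Chunk, C.IdxInChunk);

  // Insert an i32 into <0,0,0,0> at index 0: one MOVD, upper lanes zeroed.
  __m128i V = _mm_cvtsi32_si128(42);
  alignas(16) int Out[4];
  _mm_storeu_si128((__m128i *)Out, V);
  printf("%d %d %d %d\n", Out[0], Out[1], Out[2], Out[3]);  // 42 0 0 0
  return 0;
}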
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - assert(OpVT.is128BitVector() && "Expected an SSE type!"); + assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && + "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) @@ -16789,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue -X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { - const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); - - // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the - // global base reg. - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); - - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); - - SDLoc DL(Op); - Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); - - // With PIC, the address is actually $g + Offset. - if (OpFlag) { - Result = - DAG.getNode(ISD::ADD, DL, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); - } - - // For symbols that require a load from a stub to get the address, emit the - // load. - if (isGlobalStubReference(OpFlag)) - Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - - return Result; +SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue @@ -16841,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, - const SDLoc &dl, int64_t Offset, - SelectionDAG &DAG) const { - // Create the TargetGlobalAddress node, folding in the constant - // offset if it is legal. - unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); +/// Creates target global address or external symbol nodes for calls or +/// other uses. +SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, + bool ForCall) const { + // Unpack the global address or external symbol. + const SDLoc &dl = SDLoc(Op); + const GlobalValue *GV = nullptr; + int64_t Offset = 0; + const char *ExternalSym = nullptr; + if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) { + GV = G->getGlobal(); + Offset = G->getOffset(); + } else { + const auto *ES = cast<ExternalSymbolSDNode>(Op); + ExternalSym = ES->getSymbol(); + } + + // Calculate some flags for address lowering. + const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); + unsigned char OpFlags; + if (ForCall) + OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); + else + OpFlags = Subtarget.classifyGlobalReference(GV, Mod); + bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); + bool NeedsLoad = isGlobalStubReference(OpFlags); + CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; - if (OpFlags == X86II::MO_NO_FLAG && - X86::isOffsetSuitableForCodeModel(Offset, M)) { - // A direct static reference to a global. - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); - Offset = 0; + + if (GV) { + // Create a target global address if this is a global. If possible, fold the + // offset into the global address reference. Otherwise, ADD it on later. 
+ int64_t GlobalOffset = 0; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + std::swap(GlobalOffset, Offset); + } + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); + // If this is not a global address, this must be an external symbol. + Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } + // If this is a direct call, avoid the wrapper if we don't need to do any + // loads or adds. This allows SDAG ISel to match direct calls. + if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) + return Result; + Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. - if (isGlobalRelativeToPICBase(OpFlags)) { + if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. - if (isGlobalStubReference(OpFlags)) + if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); @@ -16884,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); - return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue @@ -17112,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } - if (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium() || - Subtarget.isTargetWindowsGNU()) { + if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -17254,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, APInt APIntShiftAmt; if (isConstantSplat(Amt, APIntShiftAmt)) { - uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); + uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); } @@ -17267,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -17311,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, DAG.getIntPtrConstant(0, dl)); } +static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, + const X86Subtarget &Subtarget) { + switch (Opcode) { + case ISD::SINT_TO_FP: + // TODO: Handle wider types with AVX/AVX512. + if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) + return false; + // CVTDQ2PS or (V)CVTDQ2PD + return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); + + case ISD::UINT_TO_FP: + // TODO: Handle wider types and i64 elements. 
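The LowerFunnelShift change above takes a constant shift amount modulo the element width (urem) rather than using its raw value, matching the fshl/fshr definition in which the amount is interpreted modulo the bit width. A scalar reference for the 32-bit case (illustrative helper, not LLVM code):

// 32-bit funnel shift left: concatenate Hi:Lo, shift left by Amt mod 32,
// keep the high 32 bits.
#include <cstdint>
#include <cstdio>

static uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt %= 32;                        // amount is taken modulo the width
  if (Amt == 0)
    return Hi;                      // avoid the undefined 32-bit shift below
  return (Hi << Amt) | (Lo >> (32 - Amt));
}

int main() {
  printf("%08x\n", fshl32(0x12345678u, 0x9abcdef0u, 8));   // 3456789a
  printf("%08x\n", fshl32(0x12345678u, 0x9abcdef0u, 40));  // same: 40 % 32 == 8
  return 0;
}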
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) + return false; + // VCVTUDQ2PS or VCVTUDQ2PD + return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + + default: + return false; + } +} + +/// Given a scalar cast operation that is extracted from a vector, try to +/// vectorize the cast op followed by extraction. This will avoid an expensive +/// round-trip between XMM and GPR. +static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: This could be enhanced to handle smaller integer types by peeking + // through an extend. + SDValue Extract = Cast.getOperand(0); + MVT DestVT = Cast.getSimpleValueType(); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Extract.getOperand(1))) + return SDValue(); + + // See if we have a 128-bit vector cast op for this type of cast. + SDValue VecOp = Extract.getOperand(0); + MVT FromVT = VecOp.getSimpleValueType(); + unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); + MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); + MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); + if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) + return SDValue(); + + // If we are extracting from a non-zero element, first shuffle the source + // vector to allow extracting from element zero. + SDLoc DL(Cast); + if (!isNullConstant(Extract.getOperand(1))) { + SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1); + Mask[0] = Extract.getConstantOperandVal(1); + VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); + } + // If the source vector is wider than 128-bits, extract the low part. Do not + // create an unnecessarily wide vector cast op. + if (FromVT != Vec128VT) + VecOp = extract128BitVector(VecOp, 0, DAG, DL); + + // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 + // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 + SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, + DAG.getIntPtrConstant(0, DL)); +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); @@ -17318,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, @@ -17371,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); - unsigned ByteSize = SrcVT.getSizeInBits()/8; + unsigned ByteSize = SrcVT.getSizeInBits() / 8; FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); - MachineMemOperand *MMO; + MachineMemOperand *LoadMMO; if (FI) { int SSFI = FI->getIndex(); - MMO = DAG.getMachineFunction().getMachineMemOperand( + LoadMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { - MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } - SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; - SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : - X86ISD::FILD, DL, - Tys, Ops, SrcVT, MMO); + SDValue FILDOps[] = {Chain, StackSlot}; + SDValue Result = + DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, + Tys, FILDOps, SrcVT, LoadMMO); if (useSSE) { Chain = Result.getValue(1); @@ -17397,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueSizeInBits()/8; + unsigned SSFISize = Op.getValueSizeInBits() / 8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); - SDValue Ops[] = { - Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag - }; - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; + MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); - Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, Op.getValueType(), MMO); + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, + Op.getValueType(), StoreMMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); @@ -17545,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. - SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); + SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); @@ -17680,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); @@ -17732,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); @@ -17768,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), -// just return an <SDValue(), SDValue()> pair. +// just return an SDValue(). // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 -// to i16, i32 or i64, and we lower it to a legal sequence. -// If lowered to the final integer result we return a <result, SDValue()> pair. -// Otherwise we lower it to a sequence ending with a FIST, return a -// <FIST, StackSlot> pair, and the caller is responsible for loading -// the final integer result from StackSlot. -std::pair<SDValue,SDValue> +// to i16, i32 or i64, and we lower it to a legal sequence and return the +// result. 
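vectorizeExtractedCast, introduced above and called from LowerSINT_TO_FP/LowerUINT_TO_FP, rewrites cast(extractelt V, C) into extractelt(cast V', 0) so the value never has to leave the XMM domain. Roughly the same idea at the intrinsics level, assuming SSE4.1 is available for the scalar extract being replaced (sketch only, e.g. -msse4.1):

// Two ways to turn lane 2 of a v4i32 into a float.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i V = _mm_set_epi32(40, 30, 20, 10);      // lanes: 10, 20, 30, 40

  // Round trip: PEXTRD to a GPR, then CVTSI2SS back into an XMM register.
  float A = (float)_mm_extract_epi32(V, 2);

  // Stay in XMM: move lane 2 down, convert the whole vector, read lane 0.
  __m128i Shuf = _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 2, 1, 2));
  float B = _mm_cvtss_f32(_mm_cvtepi32_ps(Shuf));  // CVTDQ2PS, no GPR involved

  printf("%f %f\n", A, B);                         // 30.000000 30.000000
  return 0;
}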
+SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { + bool IsSigned) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); @@ -17787,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. - return std::make_pair(SDValue(), SDValue()); + return SDValue(); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. - bool UnsignedFixup = !IsSigned && - DstTy == MVT::i64 && - (!Subtarget.is64Bit() || - !isScalarFPTypeInSSEReg(TheVT)); + bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; - if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { + if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); @@ -17809,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); - // These are really Legal. - if (DstTy == MVT::i32 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - if (Subtarget.is64Bit() && - DstTy == MVT::i64 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); - unsigned MemSize = DstTy.getSizeInBits()/8; + unsigned MemSize = DstTy.getStoreSize(); int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } - SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. @@ -17874,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); - Adjust = DAG.getSelect(DL, MVT::i32, Cmp, - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0x80000000, DL, MVT::i32)); + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, + DAG.getConstant(0, DL, MVT::i64), + DAG.getConstant(APInt::getSignMask(64), + DL, MVT::i64)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), @@ -17884,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
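The UnsignedFixup path above implements FP-to-u64 with only a signed conversion available: inputs at or above 2^63 are biased down by 2^63 before the FIST, and the sign bit is restored afterwards by XOR with Adjust (now a full 64-bit mask). A scalar model of that sequence (illustrative helper, assumes the input is in range and not NaN):

#include <cstdint>
#include <cstdio>

static uint64_t f64_to_u64(double X) {
  const double Thresh = 9223372036854775808.0;     // 2^63
  if (X < Thresh)                                  // already fits in i64
    return (uint64_t)(int64_t)X;
  // Bias into signed range, convert, then restore the top bit (the Adjust).
  return (uint64_t)(int64_t)(X - Thresh) ^ 0x8000000000000000ULL;
}

int main() {
  printf("%llu\n", (unsigned long long)f64_to_u64(123.0));
  printf("%llu\n", (unsigned long long)f64_to_u64(18446744073709549568.0));
  return 0;
}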
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); - Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(MF, SSFI)); - SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); - SDValue Ops[] = { - Chain, StackSlot, DAG.getValueType(TheVT) - }; - - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); + SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); + SDValue Ops[] = { Chain, StackSlot }; + + unsigned FLDSize = TheVT.getStoreSize(); + assert(FLDSize <= MemSize && "Stack slot not big enough"); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); - SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); - StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); - - if (UnsignedFixup) { - - // Insert the FIST, load its result as two i32's, - // and XOR the high i32 with Adjust. + // Build the FP_TO_INT*_IN_MEM + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemSize, MemSize); + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, + DAG.getVTList(MVT::Other), + Ops, DstTy, MMO); - SDValue FistOps[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - FistOps, DstTy, MMO); + SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); - SDValue Low32 = - DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); - SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); + // If we need an unsigned fixup, XOR the result with adjust. + if (UnsignedFixup) + Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); - SDValue High32 = - DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); - High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - - if (Subtarget.is64Bit()) { - // Join High32 and Low32 into a 64-bit result. - // (High32 << 32) | Low32 - Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); - High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); - High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, - DAG.getConstant(32, DL, MVT::i8)); - SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); - return std::make_pair(Result, SDValue()); - } - - SDValue ResultOps[] = { Low32, High32 }; - - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) - : DAG.getMergeValues(ResultOps, DL); - return std::make_pair(pair, SDValue()); - } else { - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, DstTy, MMO); - return std::make_pair(FIST, StackSlot); - } + return Res; } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - MVT VT = Op->getSimpleValueType(0); - SDValue In = Op->getOperand(0); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); + unsigned Opc = Op.getOpcode(); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); + assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && + "Unexpected extension opcode"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || @@ -17970,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); + unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); + // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) @@ -17977,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input. - return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In); + return DAG.getNode(ExtendInVecOpc, dl, VT, In); } if (Subtarget.hasInt256()) @@ -18000,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In); + SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); + + // Short-circuit if we can determine that each 128-bit half is the same value. + // Otherwise, this is difficult to match and optimize. + if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In)) + if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); - bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + bool NeedZero = Opc == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); @@ -18179,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). - Res = DAG.getBitcast(MVT::v4i64, Res); - Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); + // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
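LowerAVXExtend above splits a 256-bit extend on AVX1 into an in-register extend of the low half (ExtendInVecOpc) plus an unpack of the high half against zero or undef. The two 128-bit pieces of a v8i16-to-v8i32 zero-extend, shown with intrinsics (sketch only, requires SSE4.1, e.g. -msse4.1):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) uint16_t In[8] = {1, 2, 3, 4, 0xFFFF, 6, 7, 8};
  __m128i V = _mm_loadu_si128((const __m128i *)In);

  __m128i Lo = _mm_cvtepu16_epi32(V);                        // PMOVZXWD
  __m128i Hi = _mm_unpackhi_epi16(V, _mm_setzero_si128());   // PUNPCKHWD vs 0

  alignas(16) uint32_t Out[8];
  _mm_storeu_si128((__m128i *)Out, Lo);
  _mm_storeu_si128((__m128i *)(Out + 4), Hi);
  for (int i = 0; i != 8; ++i)
    printf("%u ", Out[i]);             // 1 2 3 4 65535 6 7 8
  printf("\n");
  return 0;
}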
+ SmallVector<int, 64> Mask; + int Scale = 64 / OutVT.getScalarSizeInBits(); + scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask); + Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); @@ -18422,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + SDLoc dl(Op); if (VT.isVector()) { - SDValue Src = Op.getOperand(0); - SDLoc dl(Op); - - if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { + if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; @@ -18447,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); - if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { + if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); @@ -18458,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { assert(!VT.isVector()); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, - IsSigned, /*IsReplace=*/ false); - SDValue FIST = Vals.first, StackSlot = Vals.second; - // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) + bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); + + if (!IsSigned && Subtarget.hasAVX512()) { + // Conversions from f32/f64 should be legal. + if (UseSSEReg) + return Op; + + // Use default expansion. + if (VT == MVT::i64) + return SDValue(); + } + + // Promote i16 to i32 if we can use a SSE operation. + if (VT == MVT::i16 && UseSSEReg) { + assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // If this is a SINT_TO_FP using SSEReg we're done. + if (UseSSEReg && IsSigned) return Op; - if (StackSlot.getNode()) - // Load the result. - return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); + // Fall back to X87. + if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) + return V; - // The node is the result. - return FIST; + llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { @@ -18491,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { /// implementation, and likely shuffle complexity of the alternate sequence. 
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize(); + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -18513,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, if (!IsFP && !Subtarget.hasSSSE3()) return Op; - // Defer forming the minimal horizontal op if the vector source has more than - // the 2 extract element uses that we're matching here. In that case, we might - // form a horizontal op that includes more than 1 add/sub op. + // Extract from a common vector. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || - !LHS.getOperand(0)->hasNUsesOfValue(2, 0)) - return Op; - - if (!isa<ConstantSDNode>(LHS.getOperand(1)) || + !isa<ConstantSDNode>(LHS.getOperand(1)) || !isa<ConstantSDNode>(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; @@ -18540,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); - if (LExtIndex == 1 && RExtIndex == 0 && + if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); - // TODO: This can be extended to handle other adjacent extract pairs. - if (LExtIndex != 0 || RExtIndex != 1) + if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); + unsigned NumLanes = BitWidth / 128; + unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit - // equivalent, so extract the 256/512-bit source op to 128-bit. - // This is free: ymm/zmm -> xmm. + // equivalent, so extract the 256/512-bit source op to 128-bit if we can. SDLoc DL(Op); - if (BitWidth == 256 || BitWidth == 512) - X = extract128BitVector(X, 0, DAG, DL); + if (BitWidth == 256 || BitWidth == 512) { + unsigned LaneIdx = LExtIndex / NumEltsPerLane; + X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); + LExtIndex %= NumEltsPerLane; + } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 + // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, - DAG.getIntPtrConstant(0, DL)); + DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -18732,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } -// Check whether an OR'd tree is PTEST-able. 
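The lowerAddSubToHorizontalOp changes above generalize the match from the first extract pair to any adjacent pair, using the fact that HADDPS computes every pair sum: lane 1 of haddps(X, X) already holds x2+x3, and for 256/512-bit sources the relevant 128-bit lane is now extracted first. A quick check of the 128-bit fact (requires SSE3, e.g. -msse3):

// hadd(X, X) = { x0+x1, x2+x3, x0+x1, x2+x3 }
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 X = _mm_set_ps(8.0f, 4.0f, 2.0f, 1.0f);   // lanes: 1, 2, 4, 8
  __m128 H = _mm_hadd_ps(X, X);

  float Out[4];
  _mm_storeu_ps(Out, H);
  // x0+x1 is lane 0, x2+x3 is lane 1 -- the (2,3) pair needs no extra shuffle.
  printf("%g %g %g %g\n", Out[0], Out[1], Out[2], Out[3]);   // 3 12 3 12
  return 0;
}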
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, - const X86Subtarget &Subtarget, - SelectionDAG &DAG, - SDValue &X86CC) { - assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); - - if (!Subtarget.hasSSE41()) - return SDValue(); - - if (!Op->hasOneUse()) - return SDValue(); - - SDNode *N = Op.getNode(); - SDLoc DL(N); - +/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) +/// style scalarized (associative) reduction patterns. +static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl<SDValue> &SrcOps) { SmallVector<SDValue, 8> Opnds; - DenseMap<SDValue, unsigned> VecInMap; - SmallVector<SDValue, 8> VecIns; + DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. - Opnds.push_back(N->getOperand(0)); - Opnds.push_back(N->getOperand(1)); + assert(Op.getOpcode() == unsigned(BinOp) && + "Unexpected bit reduction opcode"); + Opnds.push_back(Op.getOperand(0)); + Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; - // BFS traverse all OR'd operands. - if (I->getOpcode() == ISD::OR) { + // BFS traverse all BinOp operands. + if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. @@ -18771,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + return false; // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + return false; - SDValue ExtractedFromVec = I->getOperand(0); - DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); - if (M == VecInMap.end()) { - VT = ExtractedFromVec.getValueType(); - // Quit if not 128/256-bit vector. - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); + SDValue Src = I->getOperand(0); + DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src); + if (M == SrcOpMap.end()) { + VT = Src.getValueType(); // Quit if not the same type. - if (VecInMap.begin() != VecInMap.end() && - VT != VecInMap.begin()->first.getValueType()) - return SDValue(); - M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; - VecIns.push_back(ExtractedFromVec); + if (SrcOpMap.begin() != SrcOpMap.end() && + VT != SrcOpMap.begin()->first.getValueType()) + return false; + unsigned NumElts = VT.getVectorNumElements(); + APInt EltCount = APInt::getNullValue(NumElts); + M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; + SrcOps.push_back(Src); } - M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + // Quit if element already used. + unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (M->second[CIdx]) + return false; + M->second.setBit(CIdx); } - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Not extracted from 128-/256-bit vector."); + // Quit if not all elements are used. 
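LowerVectorAllZeroTest, now built on matchBitOpReduction, recognizes "OR every extracted element together and compare with zero" and replaces it with a single PTEST of the source vector. The intrinsic-level equivalent of the before/after (sketch only, requires SSE4.1, e.g. -msse4.1):

// "Is every element zero?" -- scalarized OR-reduction vs. a single PTEST.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m128i V = _mm_set_epi32(0, 0, 0, 0);

  // Scalarized form: extract each lane and OR them together.
  uint32_t Or = (uint32_t)_mm_extract_epi32(V, 0) |
                (uint32_t)_mm_extract_epi32(V, 1) |
                (uint32_t)_mm_extract_epi32(V, 2) |
                (uint32_t)_mm_extract_epi32(V, 3);
  bool AllZeroScalar = (Or == 0);

  // PTEST form: ZF is set iff (V & V) == 0.
  bool AllZeroPtest = _mm_testz_si128(V, V) != 0;

  printf("%d %d\n", AllZeroScalar, AllZeroPtest);   // 1 1
  return 0;
}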
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), + E = SrcOpMap.end(); + I != E; ++I) { + if (!I->second.isAllOnesValue()) + return false; + } - unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + return true; +} - for (DenseMap<SDValue, unsigned>::const_iterator - I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { - // Quit if not all elements are used. - if (I->second != FullMask) - return SDValue(); - } +// Check whether an OR'd tree is PTEST-able. +static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, SDValue &X86CC) { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget.hasSSE41() || !Op->hasOneUse()) + return SDValue(); + + SmallVector<SDValue, 8> VecIns; + if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + return SDValue(); + // Quit if not 128/256-bit vector. + EVT VT = VecIns[0].getValueType(); + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + + SDLoc DL(Op); MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. @@ -18822,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, - DL, MVT::i8); - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, + MVT::i8); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. @@ -18963,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); - if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || - Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Only promote the compare up to I32 if it is a 16 bit operation - // with an immediate. 16 bit immediates are to be avoided. - if (Op0.getValueType() == MVT::i16 && - ((isa<ConstantSDNode>(Op0) && - !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) || - (isa<ConstantSDNode>(Op1) && - !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) && - !DAG.getMachineFunction().getFunction().optForMinSize() && - !Subtarget.isAtom()) { + EVT CmpVT = Op0.getValueType(); + + if (CmpVT.isFloatingPoint()) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + + assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || + CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); + + // Only promote the compare up to I32 if it is a 16 bit operation + // with an immediate. 16 bit immediates are to be avoided. + if (CmpVT == MVT::i16 && !Subtarget.isAtom() && + !DAG.getMachineFunction().getFunction().hasMinSize()) { + ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0); + ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1); + // Don't do this if the immediate can fit in 8-bits. + if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || + (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); - Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { + // For equality comparisons try to use SIGN_EXTEND if the input was + // truncate from something with enough sign bits. + if (Op0.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op0.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } else if (Op1.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op1.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } + } + + CmpVT = MVT::i32; + Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); + Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } - // Use SUB instead of CMP to enable CSE between SUB and CMP. - SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return SDValue(Sub.getNode(), 1); } - assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!"); - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); + return Sub.getValue(1); } /// Convert a comparison if required by the subtarget. @@ -19146,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -19290,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } -/// Given a simple buildvector constant, return a new vector constant with each -/// element decremented. If decrementing would result in underflow or this -/// is not a simple vector constant, return an empty value. -static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { +/// Given a buildvector constant, return a new vector constant with each element +/// incremented or decremented. If incrementing or decrementing would result in +/// unsigned overflow or underflow or this is not a simple vector constant, +/// return an empty value. +static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) { auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode()); if (!BV) return SDValue(); @@ -19308,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); - // Avoid underflow. - if (Elt->getAPIntValue().isNullValue()) + // Avoid overflow/underflow. + const APInt &EltC = Elt->getAPIntValue(); + if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue())) return SDValue(); - NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT)); + NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 
1 : -1), DL, EltVT)); } return DAG.getBuildVector(VT, DL, NewVecC); @@ -19344,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = decrementVectorConstant(Op1, DAG); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; break; } + case ISD::SETUGT: { + // If the comparison is against a constant, we can turn this into a setuge. + // This is beneficial because materializing a constant 0 for the PCMPEQ is + // probably cheaper than XOR+PCMPGT using 2 different vector constants: + // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + if (!UGEOp1) + return SDValue(); + Op1 = Op0; + Op0 = UGEOp1; + break; + } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: std::swap(Op0, Op1); @@ -19446,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); - // Break 256-bit integer vector compare into smaller ones. - if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntVSETCC(Op, DAG); - // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, @@ -19503,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } } + // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. + if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && + Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { + ConstantSDNode *C1 = isConstOrConstSplat(Op1); + if (C1 && C1->getAPIntValue().isPowerOf2()) { + unsigned BitWidth = VT.getScalarSizeInBits(); + unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; + + SDValue Result = Op0.getOperand(0); + Result = DAG.getNode(ISD::SHL, dl, VT, Result, + DAG.getConstant(ShiftAmt, dl, VT)); + Result = DAG.getNode(ISD::SRA, dl, VT, Result, + DAG.getConstant(BitWidth - 1, dl, VT)); + return Result; + } + } + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntVSETCC(Op, DAG); + // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. @@ -19530,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - // TODO: This could be extended to handle a non-splat constant by checking - // that each element of the constant is not the max/null value. 
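The new SETUGT case above relies on the identity spelled out in its comment: for unsigned elements, X > C is the same as usubsat(C+1, X) == 0 provided C+1 does not wrap, which saves the XOR-with-sign-mask plus PCMPGT sequence. Verifying the identity on u16 lanes (sketch only, requires SSE2):

// X >u C  <=>  saturating (C+1) - X == 0, element-wise.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t C = 100;
  alignas(16) uint16_t In[8] = {0, 99, 100, 101, 200, 40000, 65535, 50};
  __m128i X = _mm_loadu_si128((const __m128i *)In);

  // PSUBUSW with the constant C+1, then PCMPEQW against zero.
  __m128i Sat = _mm_subs_epu16(_mm_set1_epi16((short)(C + 1)), X);
  __m128i Gt  = _mm_cmpeq_epi16(Sat, _mm_setzero_si128());  // all-ones where X > C

  alignas(16) uint16_t Out[8];
  _mm_storeu_si128((__m128i *)Out, Gt);
  for (int i = 0; i != 8; ++i)
    printf("%u > %u: %s\n", (unsigned)In[i], (unsigned)C,
           Out[i] ? "true" : "false");
  return 0;
}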
- APInt C; - if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) { + if (Cond == ISD::SETUGT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isMaxValue(); + })) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getConstant(C + 1, dl, VT); + Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETUGE; } - if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) { + if (Cond == ISD::SETULT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isNullValue(); + })) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getConstant(C - 1, dl, VT); + Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETULE; } bool Invert = false; @@ -19826,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { break; case ISD::UADDO: BaseOp = X86ISD::ADD; - Cond = X86::COND_B; + Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; @@ -19867,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); + assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } @@ -20036,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); + SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, @@ -20111,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; @@ -20120,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || - Opc == X86ISD::BT) { // FIXME + Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } @@ -20193,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - // Promote i16 cmovs if it won't prevent folding a load. - if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) { + // Or finally, promote i8 cmovs if we have CMOV, + // or i16 cmovs if it won't prevent folding a load. + // FIXME: we should not limit promotion of i8 case to only when the CMOV is + // legal, but EmitLoweredSelect() can not deal with these extensions + // being inserted between two CMOV's. 
(in i16 case too TBN) + // https://bugs.llvm.org/show_bug.cgi?id=40974 + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || + (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && + !MayFoldLoad(Op2))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; @@ -20453,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +/// Change a vector store into a pair of half-size vector stores. +static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert((StoredVal.getValueType().is256BitVector() || + StoredVal.getValueType().is512BitVector()) && + "Expecting 256/512-bit op"); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). + if (Store->isVolatile()) + return SDValue(); + + MVT StoreVT = StoredVal.getSimpleValueType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; + unsigned HalfAlign = (128 == HalfSize ? 16 : 32); + + SDLoc DL(Store); + SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); + SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); + SDValue Ptr0 = Store->getBasePtr(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); + unsigned Alignment = Store->getAlignment(); + SDValue Ch0 = + DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), + Alignment, Store->getMemOperand()->getFlags()); + SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, + Store->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Store->getMemOperand()->getFlags()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); +} + +/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar +/// type. +static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, + SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert(StoreVT.is128BitVector() && + StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); + StoredVal = DAG.getBitcast(StoreVT, StoredVal); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). 
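For the store-splitting helpers here, the only non-obvious arithmetic is the alignment of the pieces: the low half keeps the original alignment, while the piece stored at byte offset HalfAlign is only guaranteed MinAlign(Alignment, HalfAlign). A small sketch of that rule, assuming MinAlign behaves as the largest power of two dividing both arguments (this mirrors the effect of llvm::MinAlign, it is not the LLVM implementation itself):

#include <cassert>
#include <cstdint>

// Largest power of two dividing both values, i.e. the alignment still
// guaranteed for Base + Offset when Base is Align-byte aligned.
static uint64_t minAlign(uint64_t Align, uint64_t Offset) {
  uint64_t Bits = Align | Offset;
  return Bits & (~Bits + 1); // isolate the lowest set bit
}

int main() {
  // A 32-byte aligned 256-bit store split into two 128-bit halves: the high
  // half lives at byte offset 16, so only 16-byte alignment survives.
  assert(minAlign(32, 16) == 16);
  // An under-aligned store stays under-aligned for both halves.
  assert(minAlign(4, 16) == 4);
  // The same rule covers the per-element offsets in scalarizeVectorStore.
  assert(minAlign(16, 12) == 4);
  return 0;
}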
+ if (Store->isVolatile()) + return SDValue(); + + MVT StoreSVT = StoreVT.getScalarType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned ScalarSize = StoreSVT.getStoreSize(); + unsigned Alignment = Store->getAlignment(); + + SDLoc DL(Store); + SmallVector<SDValue, 4> Stores; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Offset = i * ScalarSize; + SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL); + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, + DAG.getIntPtrConstant(i, DL)); + SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, + Store->getPointerInfo().getWithOffset(Offset), + MinAlign(Alignment, Offset), + Store->getMemOperand()->getFlags()); + Stores.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); @@ -20482,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); + // If this is a 256-bit store of concatenated ops, we are better off splitting + // that store into two 128-bit stores. This avoids spurious use of 256-bit ops + // and each half can execute independently. Some cores would split the op into + // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); + if (StoreVT.is256BitVector()) { + SmallVector<SDValue, 4> CatOps; + if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) + return splitVectorStore(St, DAG); + return SDValue(); + } + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != TargetLowering::TypeWidenVector) return SDValue(); - // Widen the vector, cast to a v2x64 type, extract the single 64-bit element - // and store it. MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); - MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; - MVT CastVT = MVT::getVectorVT(StVT, 2); - StoredVal = DAG.getBitcast(CastVT, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, - DAG.getIntPtrConstant(0, dl)); - return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); + if (Subtarget.hasSSE2()) { + // Widen the vector, cast to a v2x64 type, extract the single 64-bit element + // and store it. + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, + DAG.getIntPtrConstant(0, dl)); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, + St->getMemOperand()); } // Lower vector extended loads using a shuffle. 
If SSSE3 is not available we @@ -20694,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -21240,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector<SDValue, 8> Elts; unsigned NumElts = SrcOp->getNumOperands(); - ConstantSDNode *ND; - switch(Opc) { + switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } @@ -21443,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND || + Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); @@ -21517,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { - if (!isa<ConstantSDNode>(Rnd)) - return false; + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; + + return false; + }; + auto isRoundModeSAE = [](SDValue Rnd) { + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; - unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); - return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; + return false; + }; + auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { + RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. 
+ RC ^= X86::STATIC_ROUNDING::NO_EXC; + return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || + RC == X86::STATIC_ROUNDING::TO_NEG_INF || + RC == X86::STATIC_ROUNDING::TO_POS_INF || + RC == X86::STATIC_ROUNDING::TO_ZERO; + } + } + + return false; }; SDLoc dl(Op); @@ -21537,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Rnd); - } + Op.getOperand(1), + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } + case INTR_TYPE_1OP_SAE: { + SDValue Sae = Op.getOperand(2); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); + } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); @@ -21553,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Src2, Rnd); - } + Op.getOperand(1), Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } + case INTR_TYPE_2OP_SAE: { + SDValue Sae = Op.getOperand(3); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); @@ -21577,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src1, Src2, Src3, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), @@ -21590,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); - case INTR_TYPE_1OP_MASK_RM: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - SDValue RoundingMode; - // We always add rounding mode to the Node. - // If the rounding mode is not specified, we add the - // "current direction" mode. 
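The three rounding-mode predicates introduced above classify the AVX-512 embedded rounding immediate. A compact sketch of the same classification, assuming the constants follow the usual _MM_FROUND_* encoding (TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8); the authoritative values live in X86::STATIC_ROUNDING:

#include <cassert>

// Assumed immediate encoding; see the hedge above.
enum : unsigned {
  TO_NEAREST_INT = 0, TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3,
  CUR_DIRECTION = 4, NO_EXC = 8
};

static bool isCurDirection(unsigned Imm) { return Imm == CUR_DIRECTION; }
static bool isSAEOnly(unsigned Imm) { return Imm == NO_EXC; }
static bool isSAEWithRC(unsigned Imm, unsigned &RC) {
  if ((Imm & NO_EXC) == 0)
    return false;      // embedded rounding always implies suppressing exceptions
  RC = Imm ^ NO_EXC;   // strip the SAE bit, keep the rounding control
  return RC == TO_NEAREST_INT || RC == TO_NEG_INF || RC == TO_POS_INF ||
         RC == TO_ZERO;
}

int main() {
  unsigned RC = ~0u;
  assert(isSAEWithRC(NO_EXC | TO_NEG_INF, RC) && RC == TO_NEG_INF);
  assert(isSAEOnly(NO_EXC) && isCurDirection(CUR_DIRECTION));
  assert(!isSAEWithRC(TO_ZERO, RC)); // a bare rounding control without SAE is rejected
  return 0;
}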
- if (Op.getNumOperands() == 4) - RoundingMode = - DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - else - RoundingMode = Op.getOperand(4); - assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - RoundingMode), - Mask, PassThru, Subtarget, DAG); - } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". + // - RC Opcode is specified and + // - RC is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getVectorMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src, DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, PassThru, Subtarget, DAG); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } + case INTR_TYPE_1OP_MASK_SAE: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Rnd = Op.getOperand(4); + + unsigned Opc; + if (isRoundModeCurDirection(Rnd)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Rnd)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21641,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getScalarMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, passThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Rnd), - Mask, passThru, Subtarget, DAG); + return SDValue(); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), @@ -21654,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); + unsigned Opc = IntrData->Opc0; if (HasRounding) { SDValue Sae = Op.getOperand(6); - if (!isRoundModeCurDirection(Sae)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, passThru, Subtarget, DAG); + if (isRoundModeSAE(Sae)) + Opc = IntrWithRoundingModeOpcode; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } - case INTR_TYPE_SCALAR_MASK_RM: { + case INTR_TYPE_SCALAR_MASK_RND: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue Src0 = 
Op.getOperand(3); + SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // There are 2 kinds of intrinsics in this group: - // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands - // (2) With rounding mode and sae - 7 operands. - if (Op.getNumOperands() == 6) { - SDValue Sae = Op.getOperand(5); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - Sae), - Mask, Src0, Subtarget, DAG); - } - assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); - SDValue RoundingMode = Op.getOperand(5); - SDValue Sae = Op.getOperand(6); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, Src0, Subtarget, DAG); + SDValue Rnd = Op.getOperand(5); + + SDValue NewOp; + unsigned RC = 0; + if (isRoundModeCurDirection(Rnd)) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + else if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else + return SDValue(); + + return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_SAE: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue Sae = Op.getOperand(5); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { + SDValue NewOp; + if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } - // TODO: Intrinsics should have fast-math-flags to propagate. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), - Mask, PassThru, Subtarget, DAG); + if (!NewOp) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK_RM: { + case INTR_TYPE_2OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (6 operands), - // if not, we set rounding mode to "current". 
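All of the *_MASK intrinsic forms in this switch bottom out in getVectorMaskingNode / getScalarMaskingNode, which behave like a per-lane select between the freshly computed result and the pass-through operand. A scalar sketch of that contract (lane count and element type are illustrative only):

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Per-lane view of the masking helpers: bit i of the mask picks between the
// newly computed lane and the pass-through lane.
template <typename T, std::size_t N>
std::array<T, N> applyMask(std::uint16_t Mask, const std::array<T, N> &Op,
                           const std::array<T, N> &PassThru) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = ((Mask >> i) & 1) ? Op[i] : PassThru[i];
  return R;
}

int main() {
  std::array<int, 4> Op{10, 20, 30, 40}, Pass{1, 2, 3, 4};
  auto R = applyMask(std::uint16_t{0x5}, Op, Pass); // lanes 0 and 2 active
  assert(R[0] == 10 && R[1] == 2 && R[2] == 30 && R[3] == 4);
  // Zero-masking (maskz) variants are the same thing with an all-zero
  // PassThru, which is also why several cases above replace an undef
  // PassThru with a zero vector to avoid a false register dependency.
  return 0;
}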
- SDValue Rnd; - if (Op.getNumOperands() == 6) - Rnd = Op.getOperand(5); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Rnd), + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_SCALAR_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, - Src2, Src3), + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3), + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case BLENDV: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + + EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); + Src3 = DAG.getBitcast(MaskVT, Src3); + + // Reverse the operands to match VSELECT order. + return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); + } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21783,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case CVTPD2PS: - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. 
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), - DAG.getIntPtrConstant(0, dl)); - case CVTPD2PS_RND_MASK: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } - } - assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - DAG.getIntPtrConstant(0, dl)), - Mask, PassThru, Subtarget, DAG); - } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); @@ -21829,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); - SDValue Cmp; SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC, Rnd); + SDValue Sae = Op.getOperand(4); + if (isRoundModeSAE(Sae)) + return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Sae); + if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if (!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); - - return Cmp; } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -21856,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Cmp; if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if(!Cmp.getNode()) + if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), @@ -21921,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); - else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + else if (isRoundModeSAE(Sae)) + FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); + else + return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, @@ -21940,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnesConstant(Mask)) // return data as is + if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is return Op.getOperand(1); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - DataToCompress), - Mask, PassThru, Subtarget, DAG); + // Avoid false dependency. + if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, VT); + + return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, + Mask); } - case FIXUPIMMS: - case FIXUPIMMS_MASKZ: case FIXUPIMM: - case FIXUPIMM_MASKZ:{ + case FIXUPIMM_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? - Src1 : getZeroVector(VT, Subtarget, DAG, dl); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (7 operands), - // if not, we set rounding mode to "current". - SDValue Rnd; - if (Op.getNumOperands() == 7) - Rnd = Op.getOperand(6); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); - else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); + SDValue Passthru = (IntrData->Type == FIXUPIMM) + ? Src1 + : getZeroVector(VT, Subtarget, DAG, dl); + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); + + if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) + return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); + + return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); @@ -22018,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: - case CVTPD2I_MASK: + case CVTPD2DQ_MASK: + case CVTQQ2PS_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); @@ -22049,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, PassThru, Mask); } + case CVTNEPS2BF16_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); + + // Break false dependency. 
+ if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); + + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, + Mask); + } default: break; } @@ -22279,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); - else // This function handles the SP or FP case. - Reg = RegInfo->getPtrSizedFrameRegister(MF); + else { // Handles the SP or FP case. + bool CantUseFP = RegInfo->needsStackRealignment(MF); + if (CantUseFP) + Reg = RegInfo->getPtrSizedStackRegister(MF); + else + Reg = RegInfo->getPtrSizedFrameRegister(MF); + } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } + + case Intrinsic::x86_avx512_vp2intersect_q_512: + case Intrinsic::x86_avx512_vp2intersect_q_256: + case Intrinsic::x86_avx512_vp2intersect_q_128: + case Intrinsic::x86_avx512_vp2intersect_d_512: + case Intrinsic::x86_avx512_vp2intersect_d_256: + case Intrinsic::x86_avx512_vp2intersect_d_128: { + MVT MaskVT = Op.getSimpleValueType(); + + SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); + SDLoc DL(Op); + + SDValue Operation = + DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, + Op->getOperand(1), Op->getOperand(2)); + + SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, + MaskVT, Operation); + SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, + MaskVT, Operation); + return DAG.getMergeValues({Result0, Result1}, DL); + } } } @@ -22296,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = Mask.getValueType(); + EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
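The gather builders below (getAVX2GatherNode / getGatherNode) implement the usual masked-gather contract: active lanes load from Base + Index[i] * Scale, inactive lanes keep the incoming source lane, which is why an undef source is replaced with a zero vector to break the false dependency. A scalar sketch, with 32-bit elements chosen only for illustration:

#include <cassert>
#include <cstdint>

// Per-element contract of the masked gather: active lanes load from
// Base + Index[i] * Scale, inactive lanes pass Src[i] through unchanged.
static void gather32(int32_t *Dst, const int32_t *Src, const uint8_t *Mask,
                     const int32_t *Base, const int32_t *Index, int Scale,
                     int NumElts) {
  for (int i = 0; i < NumElts; ++i)
    Dst[i] = Mask[i] ? *(const int32_t *)((const char *)Base +
                                          (int64_t)Index[i] * Scale)
                     : Src[i];
}

int main() {
  int32_t Mem[8] = {0, 10, 20, 30, 40, 50, 60, 70};
  int32_t Index[4] = {7, 0, 3, 1};
  uint8_t Mask[4] = {1, 0, 1, 1};
  int32_t Src[4] = {-1, -2, -3, -4}, Dst[4];
  gather32(Dst, Src, Mask, Mem, Index, /*Scale=*/4, 4);
  assert(Dst[0] == 70 && Dst[1] == -2 && Dst[2] == 30 && Dst[3] == 10);
  return 0;
}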
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } -static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget &Subtarget) { +static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast<ConstantSDNode>(ScaleOp); @@ -22332,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22355,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -22366,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return 
Res.getValue(1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22392,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } -/// Handles the lowering of builtin intrinsic that return the value -/// of the extended control register. -static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl<SDValue> &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; +/// Handles the lowering of builtin intrinsics with chain that return their +/// value into registers EDX:EAX. +/// If operand ScrReg is a valid register identifier, then operand 2 of N is +/// copied to SrcReg. The assumption is that SrcReg is an implicit input to +/// TargetOpcode. +/// Returns a Glue value which can be used to add extra copy-from-reg if the +/// expanded intrinsics implicitly defines extra registers (i.e. not just +/// EDX:EAX). +static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + unsigned TargetOpcode, + unsigned SrcReg, + const X86Subtarget &Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Glue; - // The ECX register is used to select the index of the XCR register to - // return. - SDValue Chain = - DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); - SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); + if (SrcReg) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue); + Glue = Chain.getValue(1); + } + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue N1Ops[] = {Chain, Glue}; + SDNode *N1 = DAG.getMachineNode( + TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1)); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. + SDValue LO, HI; if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, @@ -22420,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, LO.getValue(2)); } Chain = HI.getValue(1); + Glue = HI.getValue(2); if (Subtarget.is64Bit()) { - // Merge the two 32-bit values into a 64-bit one.. + // Merge the two 32-bit values into a 64-bit one. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); - return; - } - - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { LO, HI }; - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); - Results.push_back(Pair); - Results.push_back(Chain); -} - -/// Handles the lowering of builtin intrinsics that read performance monitor -/// counters (x86_rdpmc). -static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl<SDValue> &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; - - // The ECX register is used to select the index of the performance counter - // to read. 
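expandIntrinsicWChainHelper ends by merging the two halves returned in EDX:EAX; the SHL/OR path (64-bit targets) and the BUILD_PAIR path (32-bit targets) both compute the value sketched below:

#include <cassert>
#include <cstdint>

// The 64-bit result whose low half came back in EAX and whose high half came
// back in EDX, as produced by RDTSC, RDPMC, XGETBV and friends.
static uint64_t mergeEdxEax(uint32_t Eax, uint32_t Edx) {
  return (uint64_t(Edx) << 32) | Eax;
}

int main() {
  assert(mergeEdxEax(0xDEADBEEFu, 0x00000001u) == 0x1DEADBEEFull);
  return 0;
}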
- SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, - N->getOperand(2)); - SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); - - // Reads the content of a 64-bit performance counter and returns it in the - // registers EDX:EAX. - if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - Chain = HI.getValue(1); - - if (Subtarget.is64Bit()) { - // The EAX register is loaded with the low-order 32 bits. The EDX register - // is loaded with the supported high-order bits of the counter. - SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); - Results.push_back(Chain); - return; + return Glue; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. @@ -22481,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); + return Glue; } /// Handles the lowering of builtin intrinsics that read the time stamp counter @@ -22490,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl<SDValue> &Results) { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); - SDValue LO, HI; - // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. - if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - SDValue Chain = HI.getValue(1); - - SDValue TSC; - if (Subtarget.is64Bit()) { - // The EDX register is loaded with the high-order 32 bits of the MSR, and - // the EAX register is loaded with the low-order 32 bits. - TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC); - } else { - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI }); - } - - if (Opcode == X86ISD::RDTSCP_DAG) { - assert(N->getNumOperands() == 2 && "Unexpected number of operands!"); - - // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into - // the ECX register. Add 'ecx' explicitly to the chain. 
- SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, - HI.getValue(2)); - - Results.push_back(TSC); - Results.push_back(ecx); - Results.push_back(ecx.getValue(1)); + SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, + /* NoRegister */0, Subtarget, + Results); + if (Opcode != X86::RDTSCP) return; - } - Results.push_back(TSC); - Results.push_back(Chain); + SDValue Chain = Results[1]; + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue); + Results[1] = ecx; + Results.push_back(ecx.getValue(1)); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<SDValue, 3> Results; SDLoc DL(Op); - getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } @@ -22621,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); + case llvm::Intrinsic::x86_rdpkru: { + SDLoc dl(Op); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + // Create a RDPKRU node and pass 0 to the ECX parameter. + return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + } + case llvm::Intrinsic::x86_wrpkru: { + SDLoc dl(Op); + // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 + // to the EDX and ECX parameters. + return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, + Op.getOperand(0), Op.getOperand(2), + DAG.getConstant(0, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + } case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: @@ -22630,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later - // during ExpandISelPseudos in EmitInstrWithCustomInserter. + // during FinalizeISel in EmitInstrWithCustomInserter. 
return SDValue(); } case Intrinsic::x86_lwpins32: @@ -22660,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); - SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, + Operation.getValue(1)); + } + case Intrinsic::x86_enqcmd: + case Intrinsic::x86_enqcmds: { + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic!"); + case Intrinsic::x86_enqcmd: + Opcode = X86ISD::ENQCMD; + break; + case Intrinsic::x86_enqcmds: + Opcode = X86ISD::ENQCMDS; + break; + } + SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2), + Op.getOperand(3)); + SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } } @@ -22707,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -22743,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. - case RDPMC: { - SmallVector<SDValue, 2> Results; - getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); - return DAG.getMergeValues(Results, dl); - } - // Get Extended Control Register. + case RDPMC: + // GetExtended Control Register. case XGETBV: { SmallVector<SDValue, 2> Results; - getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); + + // RDPMC uses ECX to select the index of the performance counter to read. + // XGETBV uses ECX to select the index of the XCR register to return. + // The result is stored into registers EDX:EAX. + expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, + Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. @@ -22861,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( - SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); @@ -23444,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); - // Decompose 256-bit ops into smaller 128-bit ops. 
- if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); - assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); @@ -23539,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, return split256IntArith(Op, DAG); } -static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); if (VT.getScalarType() == MVT::i1) { SDLoc dl(Op); - switch (Op.getOpcode()) { + switch (Opcode) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: - return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1)); + // *addsat i1 X, Y --> X | Y + return DAG.getNode(ISD::OR, dl, VT, X, Y); case ISD::USUBSAT: case ISD::SSUBSAT: - return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), - DAG.getNOT(dl, Op.getOperand(1), VT)); + // *subsat i1 X, Y --> X & ~Y + return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT)); } } + if (VT.is128BitVector()) { + // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), VT); + SDLoc DL(Op); + if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) { + // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add); + } + if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { + // usubsat X, Y --> (X >u Y) ? X - Y : 0 + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + } + // Use default expansion. + return SDValue(); + } + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -23886,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Signed AVX2 implementation - extend xmm subvectors to ymm. if (VT == MVT::v32i8 && IsSigned) { - SDValue Lo = DAG.getIntPtrConstant(0, dl); - SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); - MVT ExVT = MVT::v16i16; SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); @@ -23898,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); @@ -24156,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, APInt APIntShiftAmt; if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); + + // If the shift amount is out of range, return undef. 
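The uaddsat/usubsat expansions chosen above when pminu*/pmaxu* are unavailable are plain compare-and-select rewrites. A self-checking scalar sketch for 16-bit lanes (the sampling strides are arbitrary, just to keep the loops short):

#include <cassert>
#include <cstdint>

// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
static uint16_t uaddsat16(uint16_t X, uint16_t Y) {
  uint16_t Add = X + Y;
  return X > Add ? 0xFFFF : Add;
}

// usubsat X, Y --> (X >u Y) ? X - Y : 0
static uint16_t usubsat16(uint16_t X, uint16_t Y) {
  return X > Y ? X - Y : 0;
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; X += 257) {
    for (uint32_t Y = 0; Y <= 0xFFFF; Y += 259) {
      uint32_t Sum = X + Y;
      uint32_t Diff = X >= Y ? X - Y : 0;
      assert(uaddsat16(X, Y) == (Sum > 0xFFFF ? 0xFFFF : Sum));
      assert(usubsat16(X, Y) == Diff);
    }
  }
  return 0;
}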
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits())) + return DAG.getUNDEF(VT); + uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -24197,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); + APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); + return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -24224,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -// If V is a splat value, return the source vector and splat index; -static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) { - V = peekThroughEXTRACT_SUBVECTORs(V); - - EVT VT = V.getValueType(); - unsigned Opcode = V.getOpcode(); - switch (Opcode) { - default: { - APInt UndefElts; - APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); - if (DAG.isSplatValue(V, DemandedElts, UndefElts)) { - // Handle case where all demanded elements are UNDEF. - if (DemandedElts.isSubsetOf(UndefElts)) { - SplatIdx = 0; - return DAG.getUNDEF(VT); - } - SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); - return V; - } - break; - } - case ISD::VECTOR_SHUFFLE: { - // Check if this is a shuffle node doing a splat. - // TODO - remove this and rely purely on SelectionDAG::isSplatValue, - // getTargetVShiftNode currently struggles without the splat source. - auto *SVN = cast<ShuffleVectorSDNode>(V); - if (!SVN->isSplat()) - break; - int Idx = SVN->getSplatIndex(); - int NumElts = V.getValueType().getVectorNumElements(); - SplatIdx = Idx % NumElts; - return V.getOperand(Idx / NumElts); - } - } - - return SDValue(); -} - -static SDValue GetSplatValue(SDValue V, const SDLoc &dl, - SelectionDAG &DAG) { - int SplatIdx; - if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG)) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - SrcVector.getValueType().getScalarType(), SrcVector, - DAG.getIntPtrConstant(SplatIdx, dl)); - return SDValue(); -} - static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -24282,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) { + if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); @@ -25102,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - else if (OpWidth == 128) + return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); + if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); - else - return false; + + return false; } +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? +// TODO: In 32-bit mode, use FISTP when X87 is available? 
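Earlier in this hunk, the v16i8 shift-by-immediate is performed in wider lanes and then masked with APInt::getHighBitsSet(8, 8 - ShiftAmt), which clears exactly the bits that leak in from the neighbouring byte. A scalar sketch of why that mask is sufficient, modelling two adjacent bytes as one 16-bit lane:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned ShiftAmt = 0; ShiftAmt < 8; ++ShiftAmt) {
    // Same value as getHighBitsSet(8, 8 - ShiftAmt): keep the high bits,
    // clear the ShiftAmt low bits of every byte.
    uint8_t Mask = uint8_t(0xFFu << ShiftAmt);
    for (unsigned Lo = 0; Lo < 256; Lo += 7) {
      for (unsigned Hi = 0; Hi < 256; Hi += 11) {
        uint16_t Wide = uint16_t((Hi << 8) | Lo);
        uint16_t Shifted = uint16_t(Wide << ShiftAmt);
        uint8_t WideLo = uint8_t(Shifted) & Mask;
        uint8_t WideHi = uint8_t(Shifted >> 8) & Mask;
        // Masking removes the bits of Lo that crossed into the high byte,
        // leaving the same result as two independent 8-bit shifts.
        assert(WideLo == uint8_t(Lo << ShiftAmt));
        assert(WideHi == uint8_t(Hi << ShiftAmt));
      }
    }
  }
  return 0;
}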
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - return needsCmpXchgNb(SI->getValueOperand()->getType()); + Type *MemType = SI->getValueOperand()->getType(); + + bool NoImplicitFloatOps = + SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + return false; + + return needsCmpXchgNb(MemType); } // Note: this turns large loads into lock cmpxchg8b/16b. -// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - auto PTy = cast<PointerType>(LI->getPointerOperandType()); - return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; + Type *MemType = LI->getType(); + + // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we + // can use movq to do the load. If we have X87 we can load into an 80-bit + // X87 register and store it to a stack temporary. + bool NoImplicitFloatOps = + LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && + (Subtarget.hasSSE2() || Subtarget.hasX87())) + return AtomicExpansionKind::None; + + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind @@ -25155,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; @@ -25171,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; + // If this is a canonical idempotent atomicrmw w/no uses, we have a better + // lowering available in lowerAtomicArith. + // TODO: push more cases through this path. + if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand())) + if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && + AI->use_empty()) + return nullptr; + auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); - auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence @@ -25212,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. 
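lowerIdempotentRMWIntoFencedLoad above targets read-modify-writes that cannot change memory and only matter for ordering. A hedged C++-level sketch of the shape it handles and of roughly what the lowering produces instead; the exact fence instruction depends on the subtarget (MFENCE, or the locked stack operation added further down):

#include <atomic>
#include <cstdint>

std::atomic<uint32_t> Flag{0};

// An idempotent RMW: the stored value never changes, only the seq_cst
// ordering is wanted. Front ends can leave this as "atomicrmw or %p, 0".
uint32_t idempotentRMW() {
  return Flag.fetch_or(0, std::memory_order_seq_cst);
}

// Roughly the shape the fenced-load lowering produces instead of a LOCK'ed
// read-modify-write: a full fence followed by a plain atomic load.
uint32_t fencedLoadEquivalent() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return Flag.load(std::memory_order_seq_cst);
}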
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, - AI->getType()->getPrimitiveSizeInBits()); + LoadInst *Loaded = + Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), + AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } +/// Emit a locked operation on a stack location which does not change any +/// memory location, but does involve a lock prefix. Location is chosen to be +/// a) very likely accessed only by a single thread to minimize cache traffic, +/// and b) definitely dereferenceable. Returns the new Chain result. +static SDValue emitLockedStackOp(SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue Chain, SDLoc DL) { + // Implementation notes: + // 1) LOCK prefix creates a full read/write reordering barrier for memory + // operations issued by the current processor. As such, the location + // referenced is not relevant for the ordering properties of the instruction. + // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual, + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 2) Using an immediate operand appears to be the best encoding choice + // here since it doesn't require an extra register. + // 3) OR appears to be very slightly faster than ADD. (Though, the difference + // is small enough it might just be measurement noise.) + // 4) When choosing offsets, there are several contributing factors: + // a) If there's no redzone, we default to TOS. (We could allocate a cache + // line aligned stack object to improve this case.) + // b) To minimize our chances of introducing a false dependence, we prefer + // to offset the stack usage from TOS slightly. + // c) To minimize concerns about cross thread stack usage - in particular, + // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which + // captures state in the TOS frame and accesses it from many threads - + // we want to use an offset such that the offset is in a distinct cache + // line from the TOS frame. + // + // For a general discussion of the tradeoffs and benchmark results, see: + // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + + auto &MF = DAG.getMachineFunction(); + auto &TFL = *Subtarget.getFrameLowering(); + const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0; + + if (Subtarget.is64Bit()) { + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::RSP, MVT::i64), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i64), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. + Zero, + Chain}; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); + } + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment.
+ Zero, + Chain }; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -25235,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, dl, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, dl, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. - Zero, - Chain - }; - SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops); - return SDValue(Res, 0); + SDValue Chain = Op.getOperand(0); + return emitLockedStackOp(DAG, Subtarget, Chain, dl); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. @@ -25288,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); - return SDValue(); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + cpOut, Success, EFLAGS.getValue(1)); } // Create MOVMSKB, taking into account whether we need to split for AVX1. @@ -25703,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -25717,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { - AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); @@ -25727,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, return N; } + // Specialized lowering for the canonical form of an idempotent atomicrmw. + // The core idea here is that since the memory location isn't actually + // changing, all we need is a lowering for the *ordering* impacts of the + // atomicrmw. As such, we can choose a different operation and memory + // location to minimize impact on other code. + if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) { + // On X86, the only ordering which actually requires an instruction is + // seq_cst which isn't SingleThread; everything just needs to be preserved + // during codegen and then dropped. Note that we expect (but don't assume) + // that orderings other than seq_cst and acq_rel have been canonicalized to + // a store or load. + if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && + AN->getSyncScopeID() == SyncScope::System) { + // Prefer a locked operation against a stack location to minimize cache + // traffic.
This assumes that stack locations are very likely to be + // accessed only by the owning thread. + SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); - DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); - return SDValue(); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), LockOp.getValue(1)); } -static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + auto *Node = cast<AtomicSDNode>(Op.getNode()); SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + EVT VT = Node->getMemoryVT(); + + bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent; + bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); + + // If this store is not sequentially consistent and the type is legal + // we can just keep it. + if (!IsSeqCst && IsTypeLegal) + return Op; + + if (VT == MVT::i64 && !IsTypeLegal) { + // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. + // FIXME: Use movlps with SSE1. + // FIXME: Use fist with X87. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + Subtarget.hasSSE2()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, + Ops, MVT::i64, + Node->getMemOperand()); + + // If this is a sequentially consistent store, also emit an appropriate + // barrier. + if (IsSeqCst) + Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); + + return Chain; + } + } // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) - // FIXME: On 32-bit, store -> fist or movq would be more efficient - // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. - if (cast<AtomicSDNode>(Node)->getOrdering() == - AtomicOrdering::SequentiallyConsistent || - !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast<AtomicSDNode>(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), - cast<AtomicSDNode>(Node)->getMemOperand()); - return Swap.getValue(1); - } - // Other atomic stores have a simple pattern. 
- return Op; + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + Node->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + Node->getMemOperand()); + return Swap.getValue(1); } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { @@ -25919,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } return SDValue(); @@ -25935,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } // Custom widen all the operands to avoid promotion. @@ -25980,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } @@ -25991,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); + MVT MaskVT = Mask.getSimpleValueType(); + SDValue PassThru = N->getPassThru(); SDLoc dl(Op); + // Handle AVX masked loads which don't support passthru other than 0. + if (MaskVT.getVectorElementType() != MVT::i1) { + // We also allow undef in the isel pattern. + if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) + return Op; + + SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), + N->getBasePtr(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType(), + N->isExpandingLoad()); + // Emit a blend. + SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, + PassThru); + return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); + } + assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); @@ -26011,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); - SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG); + PassThru = ExtendToType(PassThru, WideDataVT, DAG); // Mask element has to be i1. 
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && @@ -26179,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); @@ -26272,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: - case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG); + case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: @@ -26301,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N, if (!Res.getNode()) return; - assert((N->getNumValues() <= Res->getNumValues()) && + // If the original node has one result, take the return value from + // LowerOperation as is. It might not be result number 0. + if (N->getNumValues() == 1) { + Results.push_back(Res); + return; + } + + // If the original node has multiple results, then the return node should + // have the same number of results. + assert((N->getNumValues() == Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values base on N result number. - // In some cases (LowerSINT_TO_FP for example) Res has more result values - // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } @@ -26319,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: +#ifndef NDEBUG + dbgs() << "ReplaceNodeResults: "; + N->dump(&DAG); +#endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::CTPOP: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // Use a v2i64 if possible. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { + SDValue Wide = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); + Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); + // Bit count should fit in 32-bits, extract it as that and then zero + // extend to i64. Otherwise we end up extracting bits 63:32 separately. 
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); + Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, + DAG.getIntPtrConstant(0, dl)); + Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); + Results.push_back(Wide); + } + return; + } case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); @@ -26385,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } + case ISD::ABS: { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + assert(N->getValueType(0) == MVT::i64 && + "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDValue Lo, Hi, Tmp; + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + Tmp = DAG.getNode( + ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(HalfT.getSizeInBits() - 1, dl, + TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + Results.push_back(Lo); + Results.push_back(Hi); + return; + } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalzation will end up with @@ -26557,14 +27607,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { - if (!ExperimentalVectorWideningLegalization) - return; - EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8)) { + (InVT == MVT::v4i16 || InVT == MVT::v4i8) && + getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { + assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting @@ -26589,16 +27638,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) { + if (VT == MVT::v16i32 || VT == MVT::v8i64) { + if (!InVT.is128BitVector()) { + // Not a 128 bit vector, but maybe type legalization will promote + // it to 128 bits. + if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger) + return; + InVT = getTypeToTransformTo(*DAG.getContext(), InVT); + if (!InVT.is128BitVector()) + return; + + // Promote the input to 128 bits. Type legalization will turn this into + // zext_inreg/sext_inreg. + In = DAG.getNode(N->getOpcode(), dl, InVT, In); + } + // Perform custom splitting instead of the two stage extend we would get // by default. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); - bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND; - - SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG); + SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. 
unsigned NumElts = InVT.getVectorNumElements(); @@ -26608,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); - Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG); + Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); @@ -26735,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - std::pair<SDValue,SDValue> Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); - SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode()) { - // Return a load from the stack slot. - if (StackSlot.getNode()) - Results.push_back( - DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); - else - Results.push_back(FIST); - } + if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) + Results.push_back(V); return; } case ISD::SINT_TO_FP: { @@ -26800,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: - return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); - + expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, + Results); + return; case Intrinsic::x86_xgetbv: - return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); + expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, + Results); + return; } } - case ISD::INTRINSIC_WO_CHAIN: { - if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) - Results.push_back(V); - return; - } case ISD::READCYCLECOUNTER: { - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, - Results); + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; + assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && + "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), @@ -26903,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_LOAD: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { + auto *Node = cast<AtomicSDNode>(N); + if (Subtarget.hasSSE2()) { + // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the + // lower 64-bits. 
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + if (Subtarget.hasX87()) { + // First load this into an 80-bit X87 register. This will put the whole + // integer into the significand. + // FIXME: Do we need to glue? See FIXME comment in BuildFILD. + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, + dl, Tys, Ops, MVT::i64, + Node->getMemOperand()); + SDValue Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // Now store the X87 register to a stack temporary and convert to i64. + // This store is not atomic and doesn't need to be. + // FIXME: We don't need a stack temporary if the result of the load + // is already being stored. We could just directly store there. + SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, + DAG.getVTList(MVT::Other), StoreOps, + MVT::i64, MPI, 0 /*Align*/, + MachineMemOperand::MOStore); + + // Finally load the value back from the stack temporary and return it. + // This load is not atomic and doesn't need to be. + // This load will be further type legalized. + Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + return; + } + } + // TODO: Use MOVLPS when SSE1 is available? + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -26914,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; - } + case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); @@ -27061,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); - MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; - SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), - Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + if (Subtarget.hasSSE2()) { + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + SDValue Chain = Res.getValue(1); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); + SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; + SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Ld->getMemOperand()); Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(1)); return; } } @@ -27092,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; - case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; - case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; - case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FIST: return "X86ISD::FIST"; + case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; - case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; - case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; - case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; - case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; + case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; - case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; + case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -27140,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; - case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; - case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; + case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; - case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; - case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; + case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; + case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -27177,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return 
"X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; @@ -27188,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; - case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; - case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; + case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; + case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; + case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; + case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; @@ -27202,6 +28320,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VSHLV: return "X86ISD::VSHLV"; + case X86ISD::VSRLV: return "X86ISD::VSRLV"; case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; @@ -27263,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; + case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; - case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; + case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; - case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; + case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; @@ -27281,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; + case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; @@ -27302,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; - case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; + case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; - case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; + case X86ISD::VRNDSCALES_SAE: 
return "X86ISD::VRNDSCALES_SAE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; - case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; + case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; - case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; + case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; - case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; + case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; - case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; + case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -27323,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; + case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; + case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS: return "X86ISD::FADDS"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS: return "X86ISD::FSUBS"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS: return "X86ISD::FMULS"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS: return "X86ISD::FDIVS"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; - case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; - case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; + case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; + case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; + case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; + case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; + case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; @@ -27351,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; - case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; - case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; + case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; + case 
X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; - case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; - case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; + case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; + case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; + case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; + case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; + case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; + case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; - case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; + case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; @@ -27378,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; + case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; + case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; + case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; + case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; @@ -27393,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; + case X86ISD::ENQCMD: return "X86ISD:ENQCMD"; + case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS"; + case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; } return nullptr; } @@ -27478,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { return true; } +bool X86TargetLowering::isBinOp(unsigned Opcode) const { + switch (Opcode) { + // These are non-commutative binops. + // TODO: Add more X86ISD opcodes once we have test coverage. + case X86ISD::ANDNP: + case X86ISD::PCMPGT: + case X86ISD::FMAX: + case X86ISD::FMIN: + case X86ISD::FANDN: + return true; + } + + return TargetLoweringBase::isBinOp(Opcode); +} + +bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { + switch (Opcode) { + // TODO: Add more X86ISD opcodes once we have test coverage. 
+ case X86ISD::PCMPEQ: + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: + case X86ISD::FMAXC: + case X86ISD::FMINC: + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + return true; + } + + return TargetLoweringBase::isCommutativeBinOp(Opcode); +} + bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -27713,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, return sinkMBB; } -static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert input VAL into EAX - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI.getOperand(0).getReg()); - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert zero to EDX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); - - // insert WRPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert RDPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::EAX); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget, - unsigned Opc) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX, other two args into ECX, EDX. - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - - unsigned ValOps = X86::AddrNumOperands; - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI.getOperand(ValOps).getReg()); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) - .addReg(MI.getOperand(ValOps + 1).getReg()); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(Opc)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI->getOperand(i)); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); - - MI->eraseFromParent(); // The pseudo is gone now. 
- return BB; -} - MachineBasicBlock * @@ -27823,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); + MachineFunction *MF = MBB->getParent(); + // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); - SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(), - MI.memoperands_end()); + + MachineMemOperand *OldMMO = MI.memoperands().front(); + + // Clone the MMO into two separate MMOs for loading and storing + MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore); + MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -27891,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -27924,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -27933,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise - BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) - .addMBB(overflowMBB); + BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) + .addMBB(overflowMBB).addImm(X86::COND_AE); } // In offsetMBB, emit code to use the reg_save_area. @@ -27949,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 16) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -27977,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -27996,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 8) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -28033,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -28091,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); MBB->addSuccessor(EndMBB); } @@ -28371,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); - unsigned Opc = X86::GetCondBranchFromCond(FirstCC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); - unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); - BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); + BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] @@ -28463,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; - MachineBasicBlock::iterator NextMIIt = - std::next(MachineBasicBlock::iterator(MI)); + MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { - // See if we have a string of CMOVS with the same condition. + // See if we have a string of CMOVS with the same condition. Skip over + // intervening debug insts. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; + NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); } } @@ -28508,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, SinkMBB->addLiveIn(X86::EFLAGS); } + // Transfer any debug instructions inside the CMOV sequence to the sunk block. + auto DbgEnd = MachineBasicBlock::iterator(LastCMOV); + auto DbgIt = MachineBasicBlock::iterator(MI); + while (DbgIt != DbgEnd) { + auto Next = std::next(DbgIt); + if (DbgIt->isDebugInstr()) + SinkMBB->push_back(DbgIt->removeFromParent()); + DbgIt = Next; + } + // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. - SinkMBB->splice(SinkMBB->begin(), ThisMBB, + SinkMBB->splice(SinkMBB->end(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); @@ -28522,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. 
- unsigned Opc = X86::GetCondBranchFromCond(CC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] @@ -28540,53 +29655,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, - MachineBasicBlock *BB) const { - // Combine the following atomic floating-point modification pattern: - // a.store(reg OP a.load(acquire), release) - // Transform them into: - // OPss (%gpr), %xmm - // movss %xmm, (%gpr) - // Or sd equivalent for 64-bit operations. - unsigned MOp, FOp; - switch (MI.getOpcode()) { - default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); - case X86::RELEASE_FADD32mr: - FOp = X86::ADDSSrm; - MOp = X86::MOVSSmr; - break; - case X86::RELEASE_FADD64mr: - FOp = X86::ADDSDrm; - MOp = X86::MOVSDmr; - break; - } - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned ValOpIdx = X86::AddrNumOperands; - unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); - MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(FOp), - MRI.createVirtualRegister(MRI.getRegClass(VSrc))) - .addReg(VSrc); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand &Operand = MI.getOperand(i); - // Clear any kill flags on register operands as we'll create a second - // instruction using the same address operands. - if (Operand.isReg()) - Operand.setIsKill(false); - MIB.add(Operand); - } - MachineInstr *FOpMI = MIB; - MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); - MI.eraseFromParent(); // The pseudo instruction is gone now. - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -28652,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. @@ -29279,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); - BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); @@ -29309,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. - BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB); + BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); @@ -29332,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addImm(8); // Jump if the result of the shift is zero. 
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); @@ -29367,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. - BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB); + BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); @@ -29512,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); - MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - int FI = MFI.getFunctionContextIndex(); + int FI = MF->getFrameInfo().getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -29613,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); - BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); + BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); @@ -29766,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29821,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case X86::RELEASE_FADD32mr: - case X86::RELEASE_FADD64mr: - return EmitLoweredAtomicFP(MI, BB); - case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -29836,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FNSTCW16m)), CWFrameIdx); + TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); - // Load the old value of the high byte of the control word... + // Load the old value of the control word... unsigned OldCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + OrigCWFrameIdx); + + // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. + unsigned NewCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + .addReg(OldCW, RegState::Kill).addImm(0xC00); + + // Extract to 16 bits. 
+ unsigned NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), - CWFrameIdx); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + .addReg(NewCW, RegState::Kill, X86::sub_16bit); - // Set the high part to be round to zero... - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) - .addImm(0xC7F); + // Prepare memory for FLDCW. + int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + NewCWFrameIdx) + .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); - - // Restore the memory image of control word to original value - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) - .addReg(OldCW); + TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. unsigned Opc; @@ -29879,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); + TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // Thread synchronization. - case X86::MONITOR: - return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); - case X86::MONITORX: - return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); - - // Cache line zero - case X86::CLZERO: - return emitClzero(&MI, BB, Subtarget); - - // PKU feature - case X86::WRPKRU: - return emitWRPKRU(MI, BB, Subtarget); - case X86::RDPKRU: - return emitRDPKRU(MI, BB, Subtarget); + // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); @@ -30093,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); - Known = Known.zextOrTrunc(BitWidth); + Known = Known.zextOrTrunc(BitWidth, false); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } @@ -30150,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.trunc(BitWidth); break; } + case X86ISD::ANDNP: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // ANDNP = (~X & Y); + Known.One &= Known2.Zero; + Known.Zero |= Known2.One; + break; + } + case X86ISD::FOR: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // Output known-0 bits are only known if clear in both the LHS & RHS. + Known.Zero &= Known2.Zero; + // Output known-1 are known to be set if set in either the LHS | RHS. + Known.One |= Known2.One; + break; + } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); // If we don't know any bits, early out. 
@@ -30219,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - unsigned VTBits = Op.getScalarValueSizeInBits(); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getScalarSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: @@ -30257,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); - APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + const APInt &ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); @@ -30268,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); - APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + APInt ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits - 1)) return VTBits; // Sign splat. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); @@ -30284,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( // Vector compares return zero/all-bits result values. return VTBits; + case X86ISD::ANDNP: { + unsigned Tmp0 = + DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Tmp0 == 1) return 1; // Early out. + unsigned Tmp1 = + DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); + return std::min(Tmp0, Tmp1); + } + case X86ISD::CMOV: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp0 == 1) return 1; // Early out. @@ -30292,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( } } + // Handle target shuffles. + // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. + if (isTargetShuffle(Opcode)) { + bool IsUnary; + SmallVector<int, 64> Mask; + SmallVector<SDValue, 2> Ops; + if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask, + IsUnary)) { + unsigned NumOps = Ops.size(); + unsigned NumElts = VT.getVectorNumElements(); + if (Mask.size() == NumElts) { + SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0)); + for (unsigned i = 0; i != NumElts; ++i) { + if (!DemandedElts[i]) + continue; + int M = Mask[i]; + if (M == SM_SentinelUndef) { + // For UNDEF elements, we don't know anything about the common state + // of the shuffle result. + return 1; + } else if (M == SM_SentinelZero) { + // Zero = all sign bits. + continue; + } + assert(0 <= M && (unsigned)M < (NumOps * NumElts) && + "Shuffle index out of range"); + + unsigned OpIdx = (unsigned)M / NumElts; + unsigned EltIdx = (unsigned)M % NumElts; + if (Ops[OpIdx].getValueType() != VT) { + // TODO - handle target shuffle ops with different value types. + return 1; + } + DemandedOps[OpIdx].setBit(EltIdx); + } + unsigned Tmp0 = VTBits; + for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) { + if (!DemandedOps[i]) + continue; + unsigned Tmp1 = + DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1); + Tmp0 = std::min(Tmp0, Tmp1); + } + return Tmp0; + } + } + } + // Fallback case. return 1; } @@ -30305,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const { // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. 
// TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { +static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, unsigned &Shuffle, + MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); @@ -30322,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return true; } - // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { - bool Match = true; + bool MatchAny = true; + bool MatchZero = true; unsigned NumDstElts = NumMaskElts / Scale; - for (unsigned i = 0; i != NumDstElts && Match; ++i) { - Match &= isUndefOrEqual(Mask[i * Scale], (int)i); - Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); + for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) { + if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { + MatchAny = MatchZero = false; + break; + } + MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1); + MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } - if (Match) { + if (MatchAny || MatchZero) { + assert(MatchZero && "Failed to match zext but matched aext?"); unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); @@ -30343,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - if (SrcVT.getVectorNumElements() == NumDstElts) - Shuffle = unsigned(ISD::ZERO_EXTEND); - else - Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); + Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); + if (SrcVT.getVectorNumElements() != NumDstElts) + Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); @@ -30368,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { + if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; @@ -30426,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - // Attempt to match against broadcast-from-vector. 
- if (Subtarget.hasAVX2()) { - SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); - if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { - SrcVT = DstVT = MaskVT; - Shuffle = X86ISD::VBROADCAST; - return true; - } - } - return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - const APInt &Zeroable, - bool AllowFloatDomain, - bool AllowIntDomain, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT, - unsigned &PermuteImm) { +static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, + bool AllowFloatDomain, bool AllowIntDomain, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; @@ -30549,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskScalarSizeInBits, Mask, - 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, + Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; @@ -30564,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, SDValue &V2, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, - bool IsUnary) { +static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, + bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -30631,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return false; } -static bool matchBinaryPermuteVectorShuffle( +static bool matchBinaryPermuteShuffle( MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, @@ -30642,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle( // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); + int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); @@ -30678,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle( return true; } } else { - // Determine a type compatible with X86ISD::BLENDI. 
- ShuffleVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v8i32; - else if (ShuffleVT == MVT::v2i64) - ShuffleVT = MVT::v4i32; - } else { - if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) - ShuffleVT = MVT::v8i16; - else if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v4f64; - else if (ShuffleVT == MVT::v8i32) - ShuffleVT = MVT::v8f32; - } - - if (!ShuffleVT.isFloatingPoint()) { - int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); - BlendMask = - scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); - ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); - ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); - } - V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; return true; } } @@ -30715,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle( if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && - matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; @@ -30727,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -30784,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle( return false; } +static SDValue combineX86ShuffleChainWithExtract( + ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, + const X86Subtarget &Subtarget); + /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// @@ -30841,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + // Attempt to match a subvector broadcast. + // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) + if (UnaryShuffle && + (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) { + SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0); + if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) { + SDValue Src = Inputs[0]; + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(0).isUndef() && + Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits && + MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) { + return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL, + Src.getValueType(), + Src.getOperand(1))); + } + } + } + // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. @@ -30894,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. + // TODO: Should we indicate which domain is preferred if both are allowed? 
bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); @@ -30909,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && - (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; + (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() % + MaskEltSizeInBits) == 0) { + unsigned Scale = + cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() / + MaskEltSizeInBits; ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { @@ -30918,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Attempt to match against broadcast-from-vector. + // Limit AVX1 to cases where we're loading+broadcasting a scalar element. + if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) + && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); + if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { + if (V1.getValueType() == MaskVT && + V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + MayFoldLoad(V1.getOperand(0))) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = V1.getOperand(0); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + if (Subtarget.hasAVX2()) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = DAG.getBitcast(MaskVT, V1); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + } + } + SDValue NewV1 = V1; // Save operand in case early exit happens. - if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT) && + if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30930,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, - AllowIntDomain, Subtarget, Shuffle, - ShuffleVT, PermuteImm) && + if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, ShuffleVT, + PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30945,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, SDValue NewV1 = V1; // Save operands in case early exit happens. 
SDValue NewV2 = V2; - if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, NewV2, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT, UnaryShuffle) && + if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30959,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; - if (matchBinaryPermuteVectorShuffle( + if (matchBinaryPermuteShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { @@ -30979,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, - Zeroable)) { + if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -30990,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -31057,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && @@ -31222,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + + // If we have a dual input shuffle then lower to VPERMV3. 
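// (X86ISD::VPERMV3 is the two-source variable permute, i.e. VPERMT2D/Q/PS/PD,
// VPERMT2W with AVX512BW and VPERMT2B with AVX512VBMI: the index vector
// selects elements from the concatenation of the two data operands, which is
// why the feature checks below are keyed on element width and vector length.)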
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 || + MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && + (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && + (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) { + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); + V1 = DAG.getBitcast(MaskVT, V1); + V2 = DAG.getBitcast(MaskVT, V2); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); + return DAG.getBitcast(RootVT, Res); + } + // Failed to find any combines. return SDValue(); } +// Combine an arbitrary chain of shuffles + extract_subvectors into a single +// instruction if possible. +// +// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger +// type size to attempt to combine: +// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) +// --> +// extract_subvector(shuffle(x,y,m2),0) +static SDValue combineX86ShuffleChainWithExtract( + ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + unsigned NumMaskElts = BaseMask.size(); + unsigned NumInputs = Inputs.size(); + if (NumInputs == 0) + return SDValue(); + + SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end()); + SmallVector<unsigned, 4> Offsets(NumInputs, 0); + + // Peek through subvectors. + // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs? + unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits(); + for (unsigned i = 0; i != NumInputs; ++i) { + SDValue &Src = WideInputs[i]; + unsigned &Offset = Offsets[i]; + Src = peekThroughBitcasts(Src); + EVT BaseVT = Src.getValueType(); + while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && + isa<ConstantSDNode>(Src.getOperand(1))) { + Offset += Src.getConstantOperandVal(1); + Src = Src.getOperand(0); + } + WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits()); + assert((Offset % BaseVT.getVectorNumElements()) == 0 && + "Unexpected subvector extraction"); + Offset /= BaseVT.getVectorNumElements(); + Offset *= NumMaskElts; + } + + // Bail if we're always extracting from the lowest subvectors, + // combineX86ShuffleChain should match this for the current width. + if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; })) + return SDValue(); + + EVT RootVT = Root.getValueType(); + unsigned RootSizeInBits = RootVT.getSizeInBits(); + unsigned Scale = WideSizeInBits / RootSizeInBits; + assert((WideSizeInBits % RootSizeInBits) == 0 && + "Unexpected subvector extraction"); + + // If the src vector types aren't the same, see if we can extend + // them to match each other. + // TODO: Support different scalar types? 
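// Illustrative example of the bookkeeping in this helper (values hypothetical):
// with a v4i32 root, BaseMask = {0,1,4,5}, Input0 = extract_subvector(X:v8i32,4)
// and Input1 = extract_subvector(Y:v8i32,0), the per-input offsets become
// {4, 8} (the subvector index scaled to mask elements, plus i*Scale*NumMaskElts
// for input i), so the widened mask is {4,5,8,9,u,u,u,u} over the full-width X
// and Y, and the combined wide shuffle result is re-extracted to 128 bits.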
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType(); + if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) { + return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) || + Op.getValueType().getScalarType() != WideSVT; + })) + return SDValue(); + + for (SDValue &NewInput : WideInputs) { + assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && + "Shuffle vector size mismatch"); + if (WideSizeInBits > NewInput.getValueSizeInBits()) + NewInput = widenSubVector(NewInput, false, Subtarget, DAG, + SDLoc(NewInput), WideSizeInBits); + assert(WideSizeInBits == NewInput.getValueSizeInBits() && + "Unexpected subvector extraction"); + } + + // Create new mask for larger type. + for (unsigned i = 1; i != NumInputs; ++i) + Offsets[i] += i * Scale * NumMaskElts; + + SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end()); + for (int &M : WideMask) { + if (M < 0) + continue; + M = (M % NumMaskElts) + Offsets[M / NumMaskElts]; + } + WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef); + + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(WideInputs, WideMask); + assert(!WideInputs.empty() && "Shuffle with no inputs detected"); + + if (WideInputs.size() > 2) + return SDValue(); + + // Increase depth for every upper subvector we've peeked through. + Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; }); + + // Attempt to combine wider chain. + // TODO: Can we use a better Root? + SDValue WideRoot = WideInputs[0]; + if (SDValue WideShuffle = combineX86ShuffleChain( + WideInputs, WideRoot, WideMask, Depth, HasVariableMask, + AllowVariableMask, DAG, Subtarget)) { + WideShuffle = + extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); + return DAG.getBitcast(RootVT, WideShuffle); + } + return SDValue(); +} + // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. @@ -31370,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively( if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); - // TODO - Add support for more than 2 inputs. - if (2 < OpInputs.size()) - return SDValue(); - - SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); - SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); - // Add the inputs to the Ops list, avoiding duplicates. SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { - if (!Input) - return -1; // Attempt to find an existing match. SDValue InputBC = peekThroughBitcasts(Input); for (int i = 0, e = Ops.size(); i < e; ++i) @@ -31398,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively( return Ops.size() - 1; }; - int InputIdx0 = AddOp(Input0, SrcOpIndex); - int InputIdx1 = AddOp(Input1, -1); + SmallVector<int, 2> OpInputIdx; + for (SDValue OpInput : OpInputs) + OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? 
SrcOpIndex : -1)); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || @@ -31471,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively( : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); - if (OpMask[OpIdx] < (int)OpMask.size()) { - assert(0 <= InputIdx0 && "Unknown target shuffle input"); - OpMaskedIdx += InputIdx0 * MaskWidth; - } else { - assert(0 <= InputIdx1 && "Unknown target shuffle input"); - OpMaskedIdx += InputIdx1 * MaskWidth; - } + int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); + assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); + OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; Mask[i] = OpMaskedIdx; } @@ -31493,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused shuffle source ops. + // Remove unused/repeated shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); @@ -31530,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively( return Cst; // We can only combine unary and binary shuffle mask cases. - if (Ops.size() > 2) - return SDValue(); + if (Ops.size() <= 2) { + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with sequential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. + SmallVector<int, 64> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + } + + // Canonicalization of binary shuffle masks to improve pattern matching by + // commuting the inputs. + if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(Ops[0], Ops[1]); + } - // Minor canonicalization of the accumulated shuffle mask to make it easier - // to match below. All this does is detect masks with sequential pairs of - // elements, and shrink them to the half-width mask. It does this in a loop - // so it will reduce the size of the mask to the minimal width mask which - // performs an equivalent shuffle. - SmallVector<int, 64> WidenedMask; - while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { - Mask = std::move(WidenedMask); + // Finally, try to combine into a single shuffle instruction. + return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, + AllowVariableMask, DAG, Subtarget); } - // Canonicalization of binary shuffle masks to improve pattern matching by - // commuting the inputs. - if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { - ShuffleVectorSDNode::commuteMask(Mask); - std::swap(Ops[0], Ops[1]); - } + // If that failed and any input is extracted then try to combine as a + // shuffle with the larger type. + return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth, + HasVariableMask, AllowVariableMask, + DAG, Subtarget); +} - // Finally, try to combine into a single shuffle instruction. - return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, - AllowVariableMask, DAG, Subtarget); +/// Helper entry wrapper to combineX86ShufflesRecursively. 
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, + /*AllowVarMask*/ true, DAG, Subtarget); } /// Get the PSHUF-style mask from PSHUF node. @@ -31770,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, switch (Opcode) { case X86ISD::VBROADCAST: { - // If broadcasting from another shuffle, attempt to simplify it. - // TODO - we really need a general SimplifyDemandedVectorElts mechanism. SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); EVT SrcVT = Src.getValueType(); EVT BCVT = BC.getValueType(); + + // If broadcasting from another shuffle, attempt to simplify it. + // TODO - we really need a general SimplifyDemandedVectorElts mechanism. if (isTargetShuffle(BC.getOpcode()) && VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); @@ -31789,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } + + // broadcast(bitcast(src)) -> bitcast(broadcast(src)) + // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. + if (Src.getOpcode() == ISD::BITCAST && + SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) { + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), + VT.getVectorNumElements()); + return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); + } + + // Reduce broadcast source vector to lowest 128-bits. + if (SrcVT.getSizeInBits() > 128) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + extract128BitVector(Src, 0, DAG, DL)); + + // broadcast(scalar_to_vector(x)) -> broadcast(x). + if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + + // Share broadcast with the longest vector and extract low subvector (free). + for (SDNode *User : Src->uses()) + if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + return extractSubVector(SDValue(User, 0), 0, DAG, DL, + VT.getSizeInBits()); + } + + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + + // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. + // TODO: Handle MVT::v16i16 repeated blend mask. + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { + MVT SrcVT = N0.getOperand(0).getSimpleValueType(); + if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && + SrcVT.getScalarSizeInBits() >= 32) { + unsigned Mask = N.getConstantOperandVal(2); + unsigned Size = VT.getVectorNumElements(); + unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); + unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), + N1.getOperand(0), + DAG.getConstant(ScaleMask, DL, MVT::i8))); + } + } + return SDValue(); + } + case X86ISD::VPERMI: { + // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements. + // TODO: Remove when we have preferred domains in combineX86ShuffleChain. 
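// (For example, a v4i64 VPERMI of (bitcast v4f64 x) is rewritten below as
// (bitcast (v4f64 VPERMI x)), i.e. VPERMQ of a floating-point value becomes
// VPERMPD, keeping the shuffle in the source value's domain.)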
+ SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) { + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1); + return DAG.getBitcast(VT, Res); + } return SDValue(); } case X86ISD::PSHUFD: @@ -32212,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { - if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) - return SDValue(); + unsigned Opcode = N->getOpcode(); + if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) + if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) + return SDValue(); + + // For a broadcast, peek through an extract element of index 0 to find the + // horizontal op: broadcast (ext_vec_elt HOp, 0) + EVT VT = N->getValueType(0); + if (Opcode == X86ISD::VBROADCAST) { + SDValue SrcOp = N->getOperand(0); + if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + SrcOp.getValueType() == MVT::f64 && + SrcOp.getOperand(0).getValueType() == VT && + isNullConstant(SrcOp.getOperand(1))) + N = SrcOp.getNode(); + } SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && @@ -32224,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. - // TODO: Handle UNDEF operands. - if (HOp.getOperand(0) != HOp.getOperand(1)) + if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() && + HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); // When the operands of a horizontal math op are identical, the low half of - // the result is the same as the high half. If the shuffle is also replicating - // low and high halves, we don't need the shuffle. + // the result is the same as the high half. If a target shuffle is also + // replicating low and high halves, we don't need the shuffle. + if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { + if (HOp.getScalarValueSizeInBits() == 64) { + // movddup (hadd X, X) --> hadd X, X + // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X + assert((HOp.getValueType() == MVT::v2f64 || + HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && + "Unexpected type for h-op"); + return HOp; + } + return SDValue(); + } + // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, @@ -32252,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { return SDValue(); } +/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the +/// low half of each source vector and does not set any high half elements in +/// the destination vector, narrow the shuffle to half its original size. +static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { + if (!Shuf->getValueType(0).isSimple()) + return SDValue(); + MVT VT = Shuf->getSimpleValueType(0); + if (!VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // See if we can ignore all of the high elements of the shuffle. 
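// For example, a v8f32 shuffle with mask <0,1,8,9,u,u,u,u> leaves the entire
// upper half of the result undef and reads only the low v4f32 half of each
// source, so it can be performed as a v4f32 shuffle (half mask <0,1,4,5>)
// whose result is inserted back into an undef v8f32.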
+ ArrayRef<int> Mask = Shuf->getMask(); + if (!isUndefUpperHalf(Mask)) + return SDValue(); + + // Check if the shuffle mask accesses only the low half of each input vector + // (half-index output is 0 or 2). + int HalfIdx1, HalfIdx2; + SmallVector<int, 8> HalfMask(Mask.size() / 2); + if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) || + (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1)) + return SDValue(); + + // Create a half-width shuffle to replace the unnecessarily wide shuffle. + // The trick is knowing that all of the insert/extract are actually free + // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle + // of narrow inputs into a narrow output, and that is always cheaper than + // the wide shuffle that we started with. + return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), + Shuf->getOperand(1), HalfMask, HalfIdx1, + HalfIdx2, false, DAG); +} + static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) + if (SDValue V = narrowShuffle(Shuf, DAG)) + return V; + + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we have legalized the vector types, look for blends of FADD and FSUB - // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; @@ -32328,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } - // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, - // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are - // consecutive, non-overlapping, and in the right order. - SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { - Elts.push_back(Elt); - continue; - } - Elts.clear(); - break; - } - - if (Elts.size() == VT.getVectorNumElements()) - if (SDValue LD = - EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) - return LD; + // Attempt to combine into a vector load/broadcast. + if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) + return LD; // For AVX2, we sometimes want to combine // (vector_shuffle <mask> (concat_vectors t1, undef) @@ -32365,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // Simplify source operands based on shuffle mask. @@ -32378,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(N, 0); } + // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros + // in the upper 64 bits. + // TODO: Can we generalize this using computeKnownBits. 
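// (For a 128-bit source, conversions such as CVTPD2DQ, CVTTPD2DQ and CVTPD2PS
// already write zeros into the upper 64 bits of their v4i32/v4f32 result, so
// the VZEXT_MOVL is redundant and the existing bitcast can be returned as-is.)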
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && + (VT == MVT::v2f64 || VT == MVT::v2i64) && + N->getOperand(0).getOpcode() == ISD::BITCAST && + (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || + N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { + SDValue In = N->getOperand(0).getOperand(0); + switch (In.getOpcode()) { + default: + break; + case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: + case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: + case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: + case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: + case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: + case X86ISD::VFPROUND: case X86ISD::VMFPROUND: + if (In.getOperand(0).getValueType() == MVT::v2f64 || + In.getOperand(0).getValueType() == MVT::v2i64) + return N->getOperand(0); // return the bitcast + break; + } + } + + // Pull subvector inserts into undef through VZEXT_MOVL by making it an + // insert into a zero vector. This helps get VZEXT_MOVL closer to + // scalar_to_vectors where 256/512 are canonicalized to an insert and a + // 128-bit scalar_to_vector. This reduces the number of isel patterns. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && + N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && + N->getOperand(0).hasOneUse() && + N->getOperand(0).getOperand(0).isUndef() && + isNullConstant(N->getOperand(0).getOperand(2))) { + SDValue In = N->getOperand(0).getOperand(1); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, + getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), + Movl, N->getOperand(0).getOperand(2)); + } + + // If this a vzmovl of a full vector load, replace it with a vzload, unless + // the load is volatile. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && + ISD::isNormalLoad(N->getOperand(0).getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + if (!LN->isVolatile()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + VT.getVectorElementType(), + LN->getPointerInfo(), + LN->getAlignment(), + MachineMemOperand::MOLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return VZLoad; + } + } + + // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. // FIXME: This can probably go away once we default to widening legalization. @@ -32436,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Handle special case opcodes. switch (Opc) { + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: { + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + Depth + 1)) + return true; + // Multiply by zero. 
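// (If an element is known zero in either multiplicand, the corresponding
// 64-bit product lane of PMULDQ/PMULUDQ is zero, hence the union of the two
// known-zero sets below.)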
+ KnownZero = LHSZero | RHSZero; + break; + } case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { @@ -32443,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); + + // If we reuse the shift amount just for sse shift amounts then we know that + // only the bottom 64-bits are only ever used. + bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { + unsigned UseOpc = Use->getOpcode(); + return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || + UseOpc == X86ISD::VSRA) && + Use->getOperand(0) != Amt; + }); + APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, - Depth + 1)) + Depth + 1, AssumeSingleUse)) return true; LLVM_FALLTHROUGH; } @@ -32487,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + APInt DemandedLHS, DemandedRHS; + getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + break; + } + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, + Depth + 1)) + return true; + KnownZero = SrcZero.zextOrTrunc(NumElts); + KnownUndef = SrcUndef.zextOrTrunc(NumElts); + break; + } + case X86ISD::BLENDV: { + APInt SelUndef, SelZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, + SelZero, TLO, Depth + 1)) + return true; + + // TODO: Use SelZero to adjust LHS/RHS DemandedElts. + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + + KnownZero = LHSZero & RHSZero; + KnownUndef = LHSUndef & RHSUndef; + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -32494,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return false; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { - if(Src.getValueType() != VT) + if (Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); @@ -32506,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::PSHUFB: { - // TODO - simplify other variable shuffle masks. + case X86ISD::SUBV_BROADCAST: { + // Reduce size of broadcast if we don't need the upper half. 
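// For example, if only the low half of a v16f32 SUBV_BROADCAST of a v4f32
// source is demanded, it is rebuilt as a v8f32 SUBV_BROADCAST inserted into
// an undef v16f32.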
+ unsigned HalfElts = NumElts / 2; + if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + SDValue Half = Src; + if (SrcVT.getVectorNumElements() != HalfElts) { + MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); + Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); + } + + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, + TLO.DAG, SDLoc(Op), + Half.getValueSizeInBits())); + } + break; + } + case X86ISD::VPERMV: { + SDValue Mask = Op.getOperand(0); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + case X86ISD::PSHUFB: + case X86ISD::VPERMV3: + case X86ISD::VPERMILPV: { SDValue Mask = Op.getOperand(1); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, @@ -32515,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::VPPERM: + case X86ISD::VPERMIL2: { + SDValue Mask = Op.getOperand(2); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + } + + // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not + // demand any of the high elements, then narrow the op to 128/256-bits: e.g. + // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 + if ((VT.is256BitVector() || VT.is512BitVector()) && + DemandedElts.lshr(NumElts / 2) == 0) { + unsigned SizeInBits = VT.getSizeInBits(); + unsigned ExtSizeInBits = SizeInBits / 2; + + // See if 512-bit ops only use the bottom 128-bits. + if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) + ExtSizeInBits = SizeInBits / 4; + + switch (Opc) { + // Zero upper elements. + case X86ISD::VZEXT_MOVL: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + // Byte shifts by immediate. + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: + // Shift by uniform. + case X86ISD::VSHL: + case X86ISD::VSRL: + case X86ISD::VSRA: + // Shift by immediate. + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + case X86ISD::VPERMI: { + // Simplify PERMPD/PERMQ to extract_subvector. + // TODO: This should be done in shuffle combining. + if (VT == MVT::v4f64 || VT == MVT::v4i64) { + SmallVector<int, 4> Mask; + DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); + if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { + SDLoc DL(Op); + SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); + return TLO.CombineTo(Op, Insert); + } + } + break; + } + // Target Shuffles. 
+ case X86ISD::PSHUFB: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // Saturated Packs. + case X86ISD::PACKSS: + case X86ISD::PACKUS: + // Horizontal Ops. + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + SDLoc DL(Op); + MVT ExtVT = VT.getSimpleVT(); + ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), + ExtSizeInBits / ExtVT.getScalarSizeInBits()); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue Ext1 = + extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + } } // Simplify target shuffles. @@ -32606,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue RHS = Op.getOperand(1); // FIXME: Can we bound this better? APInt DemandedMask = APInt::getLowBitsSet(64, 32); - if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; break; } @@ -32727,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } break; } + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: { + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + MVT VecVT = Vec.getSimpleValueType(); + unsigned NumVecElts = VecVT.getVectorNumElements(); + + if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) { + unsigned Idx = CIdx->getZExtValue(); + unsigned VecBitWidth = VecVT.getScalarSizeInBits(); + + // If we demand no bits from the vector then we must have demanded + // bits from the implict zext - simplify to zero. 
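// (PEXTRB/PEXTRW zero-extend the selected element into a 32-bit result, so if
// none of the element's own bits are demanded the value reduces to the
// known-zero upper bits, i.e. the constant 0 folded below.)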
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth); + if (DemandedVecBits == 0) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + + APInt KnownUndef, KnownZero; + APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx); + if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, + KnownZero, TLO, Depth + 1)) + return true; + + KnownBits KnownVec; + if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, + KnownVec, TLO, Depth + 1)) + return true; + + Known = KnownVec.zext(BitWidth, true); + return false; + } + break; + } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue Vec = Op.getOperand(0); + SDValue Scl = Op.getOperand(1); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { + unsigned Idx = CIdx->getZExtValue(); + if (!OriginalDemandedElts[Idx]) + return TLO.CombineTo(Op, Vec); + + KnownBits KnownVec; + APInt DemandedVecElts(OriginalDemandedElts); + DemandedVecElts.clearBit(Idx); + if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, + KnownVec, TLO, Depth + 1)) + return true; + + KnownBits KnownScl; + unsigned NumSclBits = Scl.getScalarValueSizeInBits(); + APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); + if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) + return true; + + KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); + Known.One = KnownVec.One & KnownScl.One; + Known.Zero = KnownVec.Zero & KnownScl.Zero; + return false; + } + break; + } + case X86ISD::PACKSS: + // PACKSS saturates to MIN/MAX integer values. So if we just want the + // sign bit then we can just ask for the source operands sign bit. + // TODO - add known bits handling. + if (OriginalDemandedBits.isSignMask()) { + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS); + + KnownBits KnownLHS, KnownRHS; + APInt SignMask = APInt::getSignMask(BitWidth * 2); + if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS, + KnownLHS, TLO, Depth + 1)) + return true; + if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, + KnownRHS, TLO, Depth + 1)) + return true; + } + // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. + break; + case X86ISD::PCMPGT: + // icmp sgt(0, R) == ashr(R, BitWidth-1). + // iff we only need the sign bit then we can use R directly. + if (OriginalDemandedBits.isSignMask() && + ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) + return TLO.CombineTo(Op, Op.getOperand(1)); + break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -32868,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Helper to peek through bitops/setcc to determine size of source vector. +// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. 
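// For example, (v4i1 xor (setcc v4i64 a, b), (setcc v4i64 c, d)) reports a
// 256-bit source, since the helper recurses through AND/OR/XOR and checks the
// setcc operand width at each leaf.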
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return Src.getOperand(0).getValueSizeInBits() == Size; + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return checkBitcastSrcVectorSize(Src.getOperand(0), Size) && + checkBitcastSrcVectorSize(Src.getOperand(1), Size); + } + return false; +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. -static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, +static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, + const SDLoc &DL, const X86Subtarget &Subtarget) { - EVT VT = BitCast.getValueType(); - SDValue N0 = BitCast.getOperand(0); - EVT VecVT = N0->getValueType(0); - - if (!VT.isScalarInteger() || !VecVT.isSimple()) + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. - bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - (N0.getOperand(0).getValueType() == MVT::v16i8 || - N0.getOperand(0).getValueType() == MVT::v32i8 || - N0.getOperand(0).getValueType() == MVT::v64i8); + bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + (Src.getOperand(0).getValueType() == MVT::v16i8 || + Src.getOperand(0).getValueType() == MVT::v32i8 || + Src.getOperand(0).getValueType() == MVT::v64i8); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. @@ -32908,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; - switch (VecVT.getSimpleVT().SimpleTy) { + switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: @@ -32918,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - N0->getOperand(0).getValueType().is256BitVector()) { + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) SExtVT = MVT::v4i64; - } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -32930,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. 
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (N0->getOperand(0).getValueType().is256BitVector() || - N0->getOperand(0).getValueType().is512BitVector())) { + // TODO : use checkBitcastSrcVectorSize + if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && + (Src.getOperand(0).getValueType().is256BitVector() || + Src.getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; } break; @@ -32956,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, return SDValue(); }; - SDLoc DL(BitCast); - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0); + SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); if (SExtVT == MVT::v64i8) { SDValue Lo, Hi; @@ -32977,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, DAG.getUNDEF(MVT::v8i16)); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } - return DAG.getZExtOrTrunc(V, DL, VT); + + EVT IntVT = + EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); + V = DAG.getZExtOrTrunc(V, DL, IntVT); + return DAG.getBitcast(VT, V); } // Convert a vXi1 constant build vector to the same width scalar integer. @@ -33054,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG, +static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDLoc DL(N); - unsigned NumElts = N.getNumOperands(); - - auto *BV = cast<BuildVectorSDNode>(N); + SDLoc DL(BV); + unsigned NumElts = BV->getNumOperands(); SDValue Splat = BV->getSplatValue(); // Build MMX element from integer GPR or SSE float values. @@ -33107,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG, Ops.append(NumElts, Splat); } else { for (unsigned i = 0; i != NumElts; ++i) - Ops.push_back(CreateMMXElement(N.getOperand(i))); + Ops.push_back(CreateMMXElement(BV->getOperand(i))); } // Use tree of PUNPCKLs to build up general MMX vector. @@ -33141,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { - if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) + SDLoc dl(N); + if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasAVX512()) { - SDLoc dl(N); N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, @@ -33159,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // type, widen both sides to avoid a trip through memory. 
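For the v64i8 case handled above, two 32-lane MOVMSK results are recombined into a single 64-bit mask with a shift and an or; a trivial standalone sketch of that recombination:

#include <cassert>
#include <cstdint>

// Recombine two 32-lane MOVMSK results into one 64-bit mask.
static uint64_t combineMaskHalves(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  assert(combineMaskHalves(0x89ABCDEFu, 0x01234567u) == 0x0123456789ABCDEFull);
  return 0;
}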
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { - SDLoc dl(N); unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; @@ -33213,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8)) - return createMMXBuildVector(N0, DAG, Subtarget); + return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget); // Detect bitcasts between element or subvector extraction to x86mmx. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || @@ -33297,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Given a select, detect the following pattern: -// 1: %2 = zext <N x i8> %0 to <N x i32> -// 2: %3 = zext <N x i8> %1 to <N x i32> -// 3: %4 = sub nsw <N x i32> %2, %3 -// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] -// 5: %6 = sub nsw <N x i32> zeroinitializer, %4 -// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 +// Given a ABS node, detect the following pattern: +// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. -static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, - SDValue &Op1) { - // Check the condition of the select instruction is greater-than. - SDValue SetCC = Select->getOperand(0); - if (SetCC.getOpcode() != ISD::SETCC) - return false; - ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT && CC != ISD::SETLT) - return false; - - SDValue SelectOp1 = Select->getOperand(1); - SDValue SelectOp2 = Select->getOperand(2); - - // The following instructions assume SelectOp1 is the subtraction operand - // and SelectOp2 is the negation operand. - // In the case of SETLT this is the other way around. - if (CC == ISD::SETLT) - std::swap(SelectOp1, SelectOp2); - - // The second operand of the select should be the negation of the first - // operand, which is implemented as 0 - SelectOp1. - if (!(SelectOp2.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && - SelectOp2.getOperand(1) == SelectOp1)) - return false; - - // The first operand of SetCC is the first operand of the select, which is the - // difference between the two input vectors. - if (SetCC.getOperand(0) != SelectOp1) - return false; - - // In SetLT case, The second operand of the comparison can be either 1 or 0. - APInt SplatVal; - if ((CC == ISD::SETLT) && - !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && - SplatVal.isOneValue()) || - (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) +static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { + SDValue AbsOp1 = Abs->getOperand(0); + if (AbsOp1.getOpcode() != ISD::SUB) return false; - // In SetGT case, The second operand of the comparison can be either -1 or 0. - if ((CC == ISD::SETGT) && - !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || - ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) - return false; - - // The first operand of the select is the difference between the two input - // vectors. 
- if (SelectOp1.getOpcode() != ISD::SUB) - return false; - - Op0 = SelectOp1.getOperand(0); - Op1 = SelectOp1.getOperand(1); + Op0 = AbsOp1.getOperand(0); + Op1 = AbsOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || @@ -33476,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } -// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Bail without SSE2 or with AVX512VL (which uses predicate registers). - if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + // Bail without SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && - ExtractVT != MVT::i8) + ExtractVT != MVT::i8 && ExtractVT != MVT::i1) return SDValue(); - // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); + if (!Match && ExtractVT == MVT::i1) + Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); if (!Match) return SDValue(); @@ -33501,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); - // We require AVX2 for PMOVMSKB for v16i16/v32i8; - unsigned MatchSizeInBits = Match.getValueSizeInBits(); - if (!(MatchSizeInBits == 128 || - (MatchSizeInBits == 256 && - ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) - return SDValue(); + SDValue Movmsk; + SDLoc DL(Extract); + EVT MatchVT = Match.getValueType(); + unsigned NumElts = MatchVT.getVectorNumElements(); - // Don't bother performing this for 2-element vectors. - if (Match.getValueType().getVectorNumElements() <= 2) - return SDValue(); + if (ExtractVT == MVT::i1) { + // Special case for (pre-legalization) vXi1 reductions. + if (NumElts > 32) + return SDValue(); + if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { + // If this is a legal AVX512 predicate type then we can just bitcast. + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = DAG.getBitcast(MovmskVT, Match); + } else { + // Use combineBitcastvxi1 to create the MOVMSK. + if (NumElts == 32 && !Subtarget.hasInt256()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); + Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); + NumElts = 16; + } + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); + } + if (!Movmsk) + return SDValue(); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + } else { + // Bail with AVX512VL (which uses predicate registers). + if (Subtarget.hasVLX()) + return SDValue(); - // Check that we are extracting a reduction of all sign bits. 
- if (DAG.ComputeNumSignBits(Match) != BitWidth) - return SDValue(); + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && Subtarget.hasAVX()))) + return SDValue(); - // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. - MVT MaskVT; - if (64 == BitWidth || 32 == BitWidth) - MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), - MatchSizeInBits / BitWidth); - else - MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + // Make sure this isn't a vector of 1 element. The perf win from using + // MOVMSK diminishes with less elements in the reduction, but it is + // generally better to get the comparison over to the GPRs as soon as + // possible to reduce the number of vector ops. + if (Match.getValueType().getVectorNumElements() < 2) + return SDValue(); + + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); + + if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); + Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); + MatchSizeInBits = Match.getValueSizeInBits(); + } + + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskSrcVT; + if (64 == BitWidth || 32 == BitWidth) + MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); + Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); + NumElts = MaskSrcVT.getVectorNumElements(); + } + assert(NumElts <= 32 && "Not expecting more than 32 elements"); - APInt CompareBits; + if (BinOp == ISD::XOR) { + // parity -> (AND (CTPOP(MOVMSK X)), 1) + SDValue Mask = DAG.getConstant(1, DL, MVT::i32); + SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); + Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); + return DAG.getZExtOrTrunc(Result, DL, ExtractVT); + } + + SDValue CmpC; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 - CompareBits = APInt::getNullValue(32); + CmpC = DAG.getConstant(0, DL, MVT::i32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) - CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); CondCode = ISD::CondCode::SETEQ; } - // Perform the select as i32/i64 and then truncate to avoid partial register - // stalls. - unsigned ResWidth = std::max(BitWidth, 32u); - EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); - SDLoc DL(Extract); - SDValue Zero = DAG.getConstant(0, DL, ResVT); - SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); - SDValue Res = DAG.getBitcast(MaskVT, Match); - Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); - Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), - Ones, Zero, CondCode); - return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + // The setcc produces an i8 of 0/1, so extend that to the result width and + // negate to get the final 0/-1 mask value. 
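combineHorizontalPredicateResult maps any_of, all_of and parity reductions onto scalar tests of the MOVMSK value. A standalone sketch of those three tests, assuming at most 32 mask bits; the helper names are illustrative only:

#include <bitset>
#include <cassert>
#include <cstdint>

// any_of, all_of and parity of a NumElts-wide MOVMSK value.
static bool anyOf(uint32_t Msk) { return Msk != 0; }
static bool allOf(uint32_t Msk, unsigned NumElts) {
  return Msk == static_cast<uint32_t>((1ULL << NumElts) - 1);
}
static bool parityOf(uint32_t Msk) { return std::bitset<32>(Msk).count() & 1; }

int main() {
  assert(anyOf(0x4u) && !anyOf(0x0u));
  assert(allOf(0xFu, 4) && !allOf(0x7u, 4));
  assert(parityOf(0xBu) && !parityOf(0x9u));
  return 0;
}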
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetccVT = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); + SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); + SDValue Zero = DAG.getConstant(0, DL, ExtractVT); + return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, @@ -33592,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. - if (!Root || (Root.getOpcode() != ISD::VSELECT)) + if (!Root || Root.getOpcode() != ISD::ABS) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. @@ -33651,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) return SDValue(); + SDValue SrcBC = peekThroughBitcasts(Src); + // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. - if (X86ISD::VBROADCAST == Src.getOpcode() && - Src.getOperand(0).getValueType() == VT) - return Src.getOperand(0); + if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { + SDValue SrcOp = SrcBC.getOperand(0); + if (SrcOp.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, SrcOp); + } // Resolve the target shuffle inputs and mask. SmallVector<int, 16> Mask; SmallVector<SDValue, 2> Ops; - if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) + if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -33704,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; - SrcOp = DAG.getBitcast(SrcVT, SrcOp); SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain @@ -33714,6 +35509,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); + SrcOp = DAG.getBitcast(SrcVT, SrcOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } @@ -33723,6 +35519,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); + SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); @@ -33731,6 +35528,155 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Extracting a scalar FP value from vector element 0 is free, so extract each +/// operand first, then perform the math as a scalar op. 
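scalarizeExtEltFP, defined below, rewrites extract(vector-fp-op, 0) into the scalar op applied to lane-0 extracts of the operands. A tiny standalone illustration of the equivalence it exploits:

#include <cassert>

int main() {
  double X[2] = {1.5, 2.5};
  double Y[2] = {4.0, 8.0};
  // Vector op, then extract element 0.
  double VecAdd[2] = {X[0] + Y[0], X[1] + Y[1]};
  // Extract both operands first, then do the math as a scalar op.
  double ExtractThenAdd = X[0] + Y[0];
  assert(VecAdd[0] == ExtractThenAdd);
  return 0;
}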
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { + assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); + SDValue Vec = ExtElt->getOperand(0); + SDValue Index = ExtElt->getOperand(1); + EVT VT = ExtElt->getValueType(0); + EVT VecVT = Vec.getValueType(); + + // TODO: If this is a unary/expensive/expand op, allow extraction from a + // non-zero element because the shuffle+scalar op will be cheaper? + if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) + return SDValue(); + + // Vector FP compares don't fit the pattern of FP math ops (propagate, not + // extract, the condition code), so deal with those as a special-case. + if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) { + EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); + if (OpVT != MVT::f32 && OpVT != MVT::f64) + return SDValue(); + + // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(1), Index); + return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); + } + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Vector FP selects don't fit the pattern of FP math ops (because the + // condition has a different type and we have to change the opcode), so deal + // with those here. + // FIXME: This is restricted to pre type legalization by ensuring the setcc + // has i1 elements. If we loosen this we need to convert vector bool to a + // scalar bool. + if (Vec.getOpcode() == ISD::VSELECT && + Vec.getOperand(0).getOpcode() == ISD::SETCC && + Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 && + Vec.getOperand(0).getOperand(0).getValueType() == VecVT) { + // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0) + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Vec.getOperand(0).getValueType().getScalarType(), + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(1), Index); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(2), Index); + return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); + } + + // TODO: This switch could include FNEG and the x86-specific FP logic ops + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // missed load folding and fma+fneg combining. + switch (Vec.getOpcode()) { + case ISD::FMA: // Begin 3 operands + case ISD::FMAD: + case ISD::FADD: // Begin 2 operands + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCOPYSIGN: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: + case X86ISD::FMAX: + case X86ISD::FMIN: + case ISD::FABS: // Begin 1 operand + case ISD::FSQRT: + case ISD::FRINT: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case X86ISD::FRCP: + case X86ISD::FRSQRT: { + // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ... 
+ SDLoc DL(ExtElt); + SmallVector<SDValue, 4> ExtOps; + for (SDValue Op : Vec->ops()) + ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index)); + return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps); + } + default: + return SDValue(); + } + llvm_unreachable("All opcodes should return within switch"); +} + +/// Try to convert a vector reduction sequence composed of binops and shuffles +/// into horizontal ops. +static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + return SDValue(); + SDValue Index = ExtElt->getOperand(1); + if (!isNullConstant(Index)) + return SDValue(); + + // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. + ISD::NodeType Opc; + SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); + if (!Rdx) + return SDValue(); + + EVT VT = ExtElt->getValueType(0); + EVT VecVT = ExtElt->getOperand(0).getValueType(); + if (VecVT.getScalarType() != VT) + return SDValue(); + + unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; + SDLoc DL(ExtElt); + + // 256-bit horizontal instructions operate on 128-bit chunks rather than + // across the whole vector, so we need an extract + hop preliminary stage. + // This is the only step where the operands of the hop are not the same value. + // TODO: We could extend this to handle 512-bit or even longer vectors. + if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) || + ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) { + unsigned NumElts = VecVT.getVectorNumElements(); + SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); + SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); + VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); + Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); + } + if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && + !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) + return SDValue(); + + // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 + assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); + unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); + for (unsigned i = 0; i != ReductionSteps; ++i) + Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading @@ -33741,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; + SDValue InputVector = N->getOperand(0); + SDValue EltIdx = N->getOperand(1); + auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx); + + EVT SrcVT = InputVector.getValueType(); + EVT VT = N->getValueType(0); + SDLoc dl(InputVector); + bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT; + + if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements())) + return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); + + // Integer Constant Folding. 
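combineReductionToHorizontal above reduces a vector sum with log2(NumElts) HADD steps of (Rdx, Rdx). A standalone model of PHADDD on 4 x i32 showing why two such steps leave the full sum in lane 0; hadd here is an illustrative helper, not the intrinsic:

#include <array>
#include <cassert>

// PHADDD semantics: pairwise sums of A in the low lanes, of B in the high.
static std::array<int, 4> hadd(const std::array<int, 4> &A,
                               const std::array<int, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<int, 4> Rdx = {1, 2, 3, 4};
  Rdx = hadd(Rdx, Rdx); // {3, 7, 3, 7}
  Rdx = hadd(Rdx, Rdx); // {10, 10, 10, 10}
  assert(Rdx[0] == 1 + 2 + 3 + 4);
  return 0;
}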
+ if (CIdx && VT.isInteger()) { + APInt UndefVecElts; + SmallVector<APInt, 16> EltBits; + unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits(); + if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts, + EltBits, true, false)) { + uint64_t Idx = CIdx->getZExtValue(); + if (UndefVecElts[Idx]) + return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); + return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()), + dl, VT); + } + } + // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (IsPextr) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits( + SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) + return SDValue(N, 0); return SDValue(); + } if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; - SDValue InputVector = N->getOperand(0); - SDValue EltIdx = N->getOperand(1); - - EVT SrcVT = InputVector.getValueType(); - EVT VT = N->getValueType(0); - SDLoc dl(InputVector); - // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { @@ -33778,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } - if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && - isa<ConstantSDNode>(EltIdx) && - isa<ConstantSDNode>(InputVector.getOperand(0))) { - uint64_t ExtractedElt = N->getConstantOperandVal(1); - auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0)); - const APInt &InputValue = InputC->getAPIntValue(); - uint64_t Res = InputValue[ExtractedElt]; - return DAG.getConstant(Res, dl, MVT::i1); - } - // Check whether this extract is the root of a sum of absolute differences // pattern. This has to be done here because we really want it to happen // pre-legalization, @@ -33802,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; + if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget)) + return V; + + if (SDValue V = scalarizeExtEltFP(N, DAG)) + return V; + + // Attempt to extract a i1 element by using MOVMSK to extract the signbits + // and then testing the relevant element. 
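The i1-extract rewrite that follows tests a single bit of the MOVMSK value: extractelement vXi1 X, Idx --> ((movmsk X) & Mask) == Mask. A standalone scalar sketch, with extractBoolLane as an illustrative name:

#include <cassert>
#include <cstdint>

static bool extractBoolLane(uint32_t Movmsk, unsigned Idx) {
  uint32_t Mask = 1u << Idx;
  return (Movmsk & Mask) == Mask;
}

int main() {
  uint32_t Movmsk = 0xAu; // lanes 1 and 3 are true
  assert(!extractBoolLane(Movmsk, 0) && extractBoolLane(Movmsk, 1));
  assert(!extractBoolLane(Movmsk, 2) && extractBoolLane(Movmsk, 3));
  return 0;
}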
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) { + SmallVector<SDNode *, 16> BoolExtracts; + auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { + if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(Use->getOperand(1)) && + Use->getValueType(0) == MVT::i1) { + BoolExtracts.push_back(Use); + return true; + } + return false; + }; + if (all_of(InputVector->uses(), IsBoolExtract) && + BoolExtracts.size() > 1) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); + if (SDValue BC = + combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { + for (SDNode *Use : BoolExtracts) { + // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask + unsigned MaskIdx = Use->getConstantOperandVal(1); + APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); + SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); + SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); + Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); + DCI.CombineTo(Use, Res); + } + return SDValue(N, 0); + } + } + } + return SDValue(); } @@ -33825,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, assert(CondVT.isVector() && "Vector select expects a vector selector!"); - bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. - if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && - CondVT.getVectorElementType() == MVT::i1) { + // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? + // TODO: Can we assert that both operands are not zeros (because that should + // get simplified at node creation time)? + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && + Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 @@ -33844,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); - bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); - bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - // Try to invert the condition if true value is not all 1s and false value is - // not all 0s. - if (!TValIsAllOnes && !FValIsAllZeros && + // not all 0s. Only do this if the condition has one use. + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. 
@@ -33907,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// If both arms of a vector select are concatenated vectors, split the select, +/// and concatenate the result to eliminate a wide (256-bit) vector instruction: +/// vselect Cond, (concat T0, T1), (concat F0, F1) --> +/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1) +static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT) + return SDValue(); + + // TODO: Split 512-bit vectors too? + EVT VT = N->getValueType(0); + if (!VT.is256BitVector()) + return SDValue(); + + // TODO: Split as long as any 2 of the 3 operands are concatenated? + SDValue Cond = N->getOperand(0); + SDValue TVal = N->getOperand(1); + SDValue FVal = N->getOperand(2); + SmallVector<SDValue, 4> CatOpsT, CatOpsF; + if (!TVal.hasOneUse() || !FVal.hasOneUse() || + !collectConcatOps(TVal.getNode(), CatOpsT) || + !collectConcatOps(FVal.getNode(), CatOpsF)) + return SDValue(); + + auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); + }; + return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal }, + makeBlend, /*CheckBWI*/ false); +} + static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); @@ -33973,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { /// If this is a *dynamic* select (non-constant condition) and we can match /// this node with one of the variable blend instructions, restructure the /// condition so that blends can use the high (sign) bit of each element. -/// This function will also call SimplfiyDemandedBits on already created +/// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -34268,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } + // AVX512 - Extend select with zero to merge with target shuffle. + // select(mask, extract_subvector(shuffle(x)), zero) --> + // extract_subvector(select(insert_subvector(mask), shuffle(x), zero)) + // TODO - support non target shuffles as well. + if (Subtarget.hasAVX512() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1) { + auto SelectableOp = [&TLI](SDValue Op) { + return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + isTargetShuffle(Op.getOperand(0).getOpcode()) && + isNullConstant(Op.getOperand(1)) && + TLI.isTypeLegal(Op.getOperand(0).getValueType()) && + Op.hasOneUse() && Op.getOperand(0).hasOneUse(); + }; + + bool SelectableLHS = SelectableOp(LHS); + bool SelectableRHS = SelectableOp(RHS); + bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) { + EVT SrcVT = SelectableLHS ? 
LHS.getOperand(0).getValueType() + : RHS.getOperand(0).getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts); + LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, + VT.getSizeInBits()); + RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, + VT.getSizeInBits()); + Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, + DAG.getUNDEF(SrcCondVT), Cond, + DAG.getIntPtrConstant(0, DL)); + SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); + return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); + } + } + if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; @@ -34338,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C - // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { - return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); + return (!Op && !Cond) || + (Op && Cond && + Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) { - OpRHS = DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), OpRHS); + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, + /*AllowUndefs*/ true)) { + OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + OpRHS); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } @@ -34432,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) return V; + if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) + return V; + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -34715,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. -static SDValue combineCarryThroughADD(SDValue EFLAGS) { +static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); @@ -34728,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) { Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { - if (Carry.getConstantOperandVal(0) == X86::COND_B) - return Carry.getOperand(1); + // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? + uint64_t CarryCC = Carry.getConstantOperandVal(0); + SDValue CarryOp1 = Carry.getOperand(1); + if (CarryCC == X86::COND_B) + return CarryOp1; + if (CarryCC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp + // instruction cannot take an immediate as its first operand. 
+ // + if (CarryOp1.getOpcode() == X86ISD::SUB && + CarryOp1.getNode()->hasOneUse() && + CarryOp1.getValueType().isInteger() && + !isa<ConstantSDNode>(CarryOp1.getOperand(1))) { + SDValue SubCommute = + DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(), + CarryOp1.getOperand(1), CarryOp1.getOperand(0)); + return SDValue(SubCommute.getNode(), CarryOp1.getResNo()); + } + } + // If this is a check of the z flag of an add with 1, switch to the + // C flag. + if (CarryCC == X86::COND_E && + CarryOp1.getOpcode() == X86ISD::ADD && + isOneConstant(CarryOp1.getOperand(1))) + return CarryOp1; } } } @@ -34744,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) - if (SDValue Flags = combineCarryThroughADD(EFLAGS)) + if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) @@ -34763,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); + // cmov X, X, ?, ? --> X + if (TrueOp == FalseOp) + return TrueOp; + // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { @@ -35044,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. - bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); + bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); @@ -35283,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Use SplitOpsAndApply to handle AVX splitting. auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); + MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, @@ -35352,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. 
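The constant-multiply combine weighs a plain imul against the alternative shift/add sequence mentioned above. A standalone sketch of that kind of strength reduction, assuming 32-bit unsigned wraparound; the exact constants and LEA forms chosen by the backend may differ:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 100000; X += 37) {
    assert(9 * X == (X << 3) + X);         // lea-style: x*8 + x
    assert(30 * X == (X << 5) - (X << 1)); // two shifts and a subtract
  }
  return 0;
}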
- if (DAG.getMachineFunction().getFunction().optForMinSize()) + if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) @@ -35489,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt Mask = N0.getConstantOperandAPInt(1); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if @@ -35638,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineShift(SDNode* N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (N->getOpcode() == ISD::SHL) - if (SDValue V = combineShiftLeft(N, DAG)) - return V; - - if (N->getOpcode() == ISD::SRA) - if (SDValue V = combineShiftRightArithmetic(N, DAG)) - return V; - - if (N->getOpcode() == ISD::SRL) - if (SDValue V = combineShiftRightLogical(N, DAG, DCI)) - return V; - - return SDValue(); -} - static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -35677,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector<APInt, 32> EltBits0, EltBits1; - if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && - (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && + if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) && + (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; @@ -35750,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, // Attempt to combine as shuffle. SDValue Op(N, 0); - if (SDValue Res = - combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, - /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); @@ -35766,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, X86ISD::VSRL == N->getOpcode()) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); // Shift zero -> zero. - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); + // Detect constant shift amounts. + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) { + unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false); + return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0, + EltBits[0].getZExtValue(), DAG); + } + APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); @@ -35829,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, // We can decode 'whole byte' logical bit shifts as shuffles. 
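The 'whole byte' shift decode above treats a 128-bit logical shift by a multiple of 8 as a byte shuffle with zero fill. A standalone check of that equivalence, assuming a little-endian host as on x86:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t Src[16], Shuffled[16] = {};
  for (int I = 0; I < 16; ++I)
    Src[I] = static_cast<uint8_t>(I + 1);
  const int K = 3; // shift left by 3 * 8 = 24 bits
  for (int I = K; I < 16; ++I)
    Shuffled[I] = Src[I - K]; // PSLLDQ-style byte move, zero fill below
  // Compare against a real 128-bit shift done in two 64-bit halves.
  uint64_t Lo, Hi;
  std::memcpy(&Lo, Src, 8);
  std::memcpy(&Hi, Src + 8, 8);
  uint64_t NewLo = Lo << (8 * K);
  uint64_t NewHi = (Hi << (8 * K)) | (Lo >> (64 - 8 * K));
  uint8_t Expected[16];
  std::memcpy(Expected, &NewLo, 8);
  std::memcpy(Expected + 8, &NewHi, 8);
  assert(std::memcmp(Shuffled, Expected, 16) == 0);
  return 0;
}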
if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -35864,18 +37958,20 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - assert( - ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && - N->getValueType(0) == MVT::v8i16)) && - "Unexpected vector insertion"); + EVT VT = N->getValueType(0); + assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && + "Unexpected vector insertion"); + + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBitsPerElt), DCI)) + return SDValue(N, 0); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); - if (SDValue Res = - combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, - /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); @@ -35894,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - SDValue CMP0 = N0->getOperand(1); - SDValue CMP1 = N1->getOperand(1); + SDValue CMP0 = N0.getOperand(1); + SDValue CMP1 = N1.getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. @@ -35987,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Match (xor X, -1) -> X. +// Match extract_subvector(xor X, -1) -> extract_subvector(X). +// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). +static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { + V = peekThroughBitcasts(V); + if (V.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) + return V.getOperand(0); + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { + if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { + Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), + Not, V.getOperand(1)); + } + } + SmallVector<SDValue, 2> CatOps; + if (collectConcatOps(V.getNode(), CatOps)) { + for (SDValue &CatOp : CatOps) { + SDValue NotCat = IsNOT(CatOp, DAG); + if (!NotCat) return SDValue(); + CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); + } + return SDValue(); +} + /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). 
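The ANDNP fold described above is the plain (~X) & Y identity, which PANDN/ANDNPS compute in a single instruction; a one-assert standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xF0F0A5A5u, Y = 0x0FF0FFFFu;
  assert(((X ^ 0xFFFFFFFFu) & Y) == (~X & Y));
  return 0;
}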
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -35996,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { return SDValue(); SDValue X, Y; - SDValue N0 = peekThroughBitcasts(N->getOperand(0)); - SDValue N1 = peekThroughBitcasts(N->getOperand(1)); - if (N0.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) { - X = N0.getOperand(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (SDValue Not = IsNOT(N0, DAG)) { + X = Not; Y = N1; - } else if (N1.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) { - X = N1.getOperand(0); + } else if (SDValue Not = IsNOT(N1, DAG)) { + X = Not; Y = N0; } else return SDValue(); @@ -36046,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // The type of the truncated inputs. - if (N0->getOperand(0).getValueType() != VT) + if (N0.getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. @@ -36062,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. - N0 = N0->getOperand(0); + N0 = N0.getOperand(0); if (RHSTrunc) - N1 = N1->getOperand(0); + N1 = N1.getOperand(0); else N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); @@ -36088,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - unsigned FPOpcode = ISD::DELETED_NODE; - if (N->getOpcode() == ISD::AND) - FPOpcode = X86ISD::FAND; - else if (N->getOpcode() == ISD::OR) - FPOpcode = X86ISD::FOR; - else if (N->getOpcode() == ISD::XOR) - FPOpcode = X86ISD::FXOR; - - assert(FPOpcode != ISD::DELETED_NODE && - "Unexpected input node for FP logic conversion"); - EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && - ((Subtarget.hasSSE1() && VT == MVT::i32) || - (Subtarget.hasSSE2() && VT == MVT::i64))) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - EVT N00Type = N00.getValueType(); - EVT N10Type = N10.getValueType(); - if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { - SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); - return DAG.getBitcast(VT, FPLogic); - } + + if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + + // Ensure that both types are the same and are legal scalar fp types. 
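convertIntLogicToFPLogic keeps logic on bitcast scalars in the FP domain to avoid SSE-to-GPR moves. A standalone sketch of the underlying bit-level equivalence, using the fabs sign mask as the example and assuming IEEE-754 binary32:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  float F = -3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(F));
  Bits &= 0x7fffffffu; // clear the sign bit, the same op ANDPS would perform
  float Abs;
  std::memcpy(&Abs, &Bits, sizeof(Abs));
  assert(Abs == std::fabs(F));
  return 0;
}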
+ if (N00Type != N10Type || + !((Subtarget.hasSSE1() && N00Type == MVT::f32) || + (Subtarget.hasSSE2() && N00Type == MVT::f64))) + return SDValue(); + + unsigned FPOpcode; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected input node for FP logic conversion"); + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; } - return SDValue(); + + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); } /// If this is a zero/all-bits result that is bitwise-anded with a low bits @@ -36371,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineParity(N, DAG, Subtarget)) return V; + // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (Mask) { + APInt AllBits = APInt::getAllOnesValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ); + } + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -36392,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -36440,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)) +static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); + + EVT VT = N->getValueType(0); + if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) + return SDValue(); + + SDValue N0 = peekThroughBitcasts(N->getOperand(0)); + SDValue N1 = peekThroughBitcasts(N->getOperand(1)); + if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) + return SDValue(); + + // On XOP we'll lower to PCMOV so accept one use, otherwise only + // do this if either mask has multiple uses already. + if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || + !N1.getOperand(1).hasOneUse())) + return SDValue(); + + // Attempt to extract constant byte masks. + APInt UndefElts0, UndefElts1; + SmallVector<APInt, 32> EltBits0, EltBits1; + if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0, + false, false)) + return SDValue(); + if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1, + false, false)) + return SDValue(); + + for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) { + // TODO - add UNDEF elts support. 
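canonicalizeBitSelect above matches the classic bit-select form OR(AND(X,C),AND(Y,~C)). A standalone scalar sketch of what that expression computes; bitSelect is an illustrative name:

#include <cassert>
#include <cstdint>

// Pick each bit from X where M is set, from Y elsewhere.
static uint32_t bitSelect(uint32_t M, uint32_t X, uint32_t Y) {
  return (X & M) | (Y & ~M);
}

int main() {
  assert(bitSelect(0xFF00FF00u, 0x12345678u, 0xABCDEF01u) == 0x12CD5601u);
  return 0;
}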
+ if (UndefElts0[i] || UndefElts1[i]) + return SDValue(); + if (EltBits0[i] != ~EltBits1[i]) + return SDValue(); + } + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = + DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), + DAG.getBitcast(VT, N1.getOperand(0))); + return DAG.getNode(ISD::OR, DL, VT, X, Y); +} + // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) @@ -36472,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { return true; } +// Try to match: +// (or (and (M, (sub 0, X)), (pandn M, X))) +// which is a special case of vselect: +// (vselect M, (sub 0, X), X) +// Per: +// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate +// We know that, if fNegate is 0 or 1: +// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) +// +// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: +// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) +// ( M ? -X : X) == ((X ^ M ) + (M & 1)) +// This lets us transform our vselect to: +// (add (xor X, M), (and M, 1)) +// And further to: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoConditionalNegate( + EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + EVT MaskVT = Mask.getValueType(); + assert(MaskVT.isInteger() && + DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && + "Mask must be zero/all-bits"); + + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + return SDValue(); + if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) + return SDValue(); + + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; + + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + else + return SDValue(); + + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); +} + // Try to fold: // (or (and (m, y), (pandn m, x))) // into: @@ -36507,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); - // Try to match: - // (or (and (M, (sub 0, X)), (pandn M, X))) - // which is a special case of vselect: - // (vselect M, (sub 0, X), X) - // Per: - // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate - // We know that, if fNegate is 0 or 1: - // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) - // - // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: - // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) - // ( M ? 
-X : X) == ((X ^ M ) + (M & 1)) - // This lets us transform our vselect to: - // (add (xor X, M), (and M, 1)) - // And further to: - // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && - DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; - SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - - if (V) { - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); - SDValue SubOp2 = Mask; - - // If the negate was on the false side of the select, then - // the operands of the SUB need to be swapped. PR 27251. - // This is because the pattern being matched above is - // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) - // but if the pattern matched was - // (vselect M, X, (sub (0, X))), that is really negation of the pattern - // above, -(vselect M, (sub 0, X), X), and therefore the replacement - // pattern also needs to be a negation of the replacement pattern above. - // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the - // sub accomplishes the negation of the replacement pattern. - if (V == Y) - std::swap(SubOp1, SubOp2); - - SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); - return DAG.getBitcast(VT, Res); - } - } + // Attempt to combine to conditional negate: (sub (xor X, M), M) + if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, + DAG, Subtarget)) + return Res; // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) @@ -36665,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); - EVT VT = OR->getValueType(0); - SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); @@ -36702,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; + if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) + return R; + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // Attempt to recursively combine an OR of shuffles. 
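combineLogicBlendIntoConditionalNegate above uses the bithack identity (M ? -X : X) == sub(xor(X, M), M) for M equal to 0 or all-ones. A standalone check over a few values, using unsigned arithmetic to keep the wraparound well defined:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {-5, 0, 7, 123456}) {
    for (uint32_t M : {0u, 0xFFFFFFFFu}) {
      int32_t Expected = M ? -X : X;
      int32_t Folded = static_cast<int32_t>((static_cast<uint32_t>(X) ^ M) - M);
      assert(Folded == Expected);
    }
  }
  return 0;
}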
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -36718,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some @@ -36747,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SDValue ShMsk0; if (ShAmt0.getOpcode() == ISD::AND && isa<ConstantSDNode>(ShAmt0.getOperand(1)) && - ShAmt0.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk0 = ShAmt0; ShAmt0 = ShAmt0.getOperand(0); } SDValue ShMsk1; if (ShAmt1.getOpcode() == ISD::AND && isa<ConstantSDNode>(ShAmt1.getOperand(1)) && - ShAmt1.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk1 = ShAmt1; ShAmt1 = ShAmt1.getOperand(0); } @@ -36765,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); - unsigned Opc = X86ISD::SHLD; + unsigned Opc = ISD::FSHL; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB || - ShAmt0.getOpcode() == ISD::XOR) { - Opc = X86ISD::SHRD; + if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { + Opc = ISD::FSHR; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); std::swap(ShMsk0, ShMsk1); } - // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C ) + auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, + SDValue Amt) { + if (Opc == ISD::FSHR) + std::swap(Op0, Op1); + return DAG.getNode(Opc, DL, VT, Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); + }; + + // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getOpcode() == ISD::AND && + isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) && + ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { + ShMsk1 = ShAmt1Op1; + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + } if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if ((SumC->getAPIntValue() == Bits || (SumC->getAPIntValue() == 0 && ShMsk1)) && ShAmt1Op1 == ShAmt0) - return DAG.getNode(Opc, DL, VT, Op0, Op1, - 
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1, ShAmt0); } } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return DAG.getNode(Opc, DL, VT, - N0.getOperand(0), N1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, - MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1, ShAmt0); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) { - unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); + unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); @@ -36812,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandVal(1) == 1) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + Op1.getConstantOperandAPInt(1) == 1) { + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } } } @@ -36862,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // Make sure the shift amount extracts the sign bit. if (!isa<ConstantSDNode>(Shift.getOperand(1)) || - Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. @@ -36915,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return SDValue(); // The shift should be smearing the sign bit across each vector element. - auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1)); - if (!ShiftBV) - return SDValue(); - - EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); - auto *ShiftAmt = ShiftBV->getConstantSplatNode(); - if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + auto *ShiftAmt = + isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); + if (!ShiftAmt || + ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious @@ -37203,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, AVGBuilder); } - if (Operands[0].getOpcode() == ISD::ADD) + // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). + // Match the or case only if its 'add-like' - can be replaced by an add. 
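For reference, a small self-contained check (illustrative only) of the identity behind the switch above from X86ISD::SHLD/SHRD to the generic ISD::FSHL/FSHR nodes: OR(SHL(X, C), SRL(Y, Bits - C)) is the funnel shift left of the concatenation X:Y by C, and the mirrored SRL/SHL form is FSHR.

#include <cassert>
#include <cstdint>

// Reference model of a 32-bit funnel shift left: concatenate X:Y, shift the
// 64-bit value left by C (mod 32) and keep the upper half.
static uint32_t fshl32(uint32_t X, uint32_t Y, unsigned C) {
  uint64_t Concat = ((uint64_t)X << 32) | Y;
  return (uint32_t)((Concat << (C & 31)) >> 32);
}

int main() {
  uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
  for (unsigned C = 1; C < 32; ++C) {
    // The double-shift-plus-OR pattern the combine matches...
    uint32_t OrOfShifts = (X << C) | (Y >> (32 - C));
    // ...is exactly FSHL(X, Y, C).
    assert(OrOfShifts == fshl32(X, Y, C));
  }
  return 0;
}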
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { + if (ISD::ADD == V.getOpcode()) { + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + } + if (ISD::ZERO_EXTEND != V.getOpcode()) + return false; + V = V.getOperand(0); + if (V.getValueType() != VT || ISD::OR != V.getOpcode() || + !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) + return false; + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + }; + + SDValue Op0, Op1; + if (FindAddLike(Operands[0], Op0, Op1)) std::swap(Operands[0], Operands[1]); - else if (Operands[1].getOpcode() != ISD::ADD) + else if (!FindAddLike(Operands[1], Op0, Op1)) return SDValue(); - Operands[2] = Operands[1].getOperand(0); - Operands[1] = Operands[1].getOperand(1); + Operands[2] = Op0; + Operands[1] = Op1; // Now we have three operands of two additions. Check that one of them is a - // constant vector with ones, and the other two are promoted from i8/i16. + // constant vector with ones, and the other two can be promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; @@ -37219,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) - if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || - Operands[j].getOperand(0).getValueType() != VT) - return SDValue(); + if (Operands[j].getValueType() != VT) { + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + Operands[j] = Operands[j].getOperand(0); + } // The pattern is detected, emit X86ISD::AVG instruction(s). - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Operands[0].getOperand(0), - Operands[1].getOperand(0) }, AVGBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]}, + AVGBuilder); } return SDValue(); @@ -37246,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
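Two facts drive the "add-like" OR handling above, sketched below with invented names: an OR of values with no common set bits behaves exactly like an ADD (no carries can occur), and the pattern being reduced, trunc(((zext a) + (zext b) + 1) >> 1), is precisely the rounded average that PAVGB/PAVGW compute.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      // With no common set bits there are no carries, so OR equals ADD; this
      // is why zext(or(a, b)) is accepted as an "add-like" operand.
      if ((A & B) == 0)
        assert((A | B) == A + B);
      // The rounded average PAVGB computes for two u8 inputs.
      uint8_t Avg = (uint8_t)((A + B + 1) >> 1);
      assert(Avg == (A + B + 1) / 2);
    }
  return 0;
}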
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; - unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, - AddressSpace, Alignment, &Fast) && !Fast))) { + *Ld->getMemOperand(), &Fast) && + !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Ptr = Ld->getBasePtr(); - + unsigned HalfAlign = 16; + SDValue Ptr1 = Ld->getBasePtr(); + SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - NumElems/2); + NumElems / 2); SDValue Load1 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); - - Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); - SDValue Load2 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags()); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, + Ld->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load1.getValue(1), - Load2.getValue(1)); + Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } + // Bool vector load - attempt to cast to an integer, as we have good + // (vXiY *ext(vXi1 bitcast(iX))) handling. + if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && + RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { + unsigned NumElts = RegVT.getVectorNumElements(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + if (TLI.isTypeLegal(IntVT)) { + SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Alignment, + Ld->getMemOperand()->getFlags()); + SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); + return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); + } + } + return SDValue(); } @@ -37404,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, if (ML->getPassThru().isUndef()) return SDValue(); + if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode())) + return SDValue(); + // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), @@ -37434,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::SEXTLOAD) + if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. 
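A rough model (assumptions noted in the comments, names invented) of the new bool-vector load path above: the vXi1 load is performed as a single iX integer load, and individual lanes are recovered from the bits of that integer.

#include <cassert>
#include <cstdint>

// A v16i1 load modelled as one i16 load; on x86, bit I of the loaded integer
// is assumed to correspond to boolean lane I of the vector.
static bool Lane(uint16_t Mask, unsigned I) { return (Mask >> I) & 1; }

int main() {
  uint16_t Mask = 0xA005; // lanes 0, 2, 13, 15 set
  assert(Lane(Mask, 0) && Lane(Mask, 2) && Lane(Mask, 13) && Lane(Mask, 15));
  assert(!Lane(Mask, 1) && !Lane(Mask, 14));
  return 0;
}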
@@ -37504,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); - return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -37543,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; @@ -37551,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -37561,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); + } + return SDValue(); } // Resolve truncating stores. unsigned NumElems = VT.getVectorNumElements(); - EVT StVT = Mst->getMemoryVT(); - SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. 
@@ -37644,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast<StoreSDNode>(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); + unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -37699,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); - unsigned Alignment = St->getAlignment(); - SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl); @@ -37724,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; - unsigned AddressSpace = St->getAddressSpace(); - unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AddressSpace, Alignment, &Fast) && + *St->getMemOperand(), &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); - SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); + return splitVectorStore(St, DAG); + } - SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); + // Split under-aligned vector non-temporal stores. + if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { + // ZMM/YMM nt-stores - either it can be stored as a series of shorter + // vectors or the legalizer can scalarize it to use MOVNTI. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + return splitVectorStore(St, DAG); + } + + // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64 + // to use MOVNTI. + if (VT.is128BitVector() && Subtarget.hasSSE2()) { + MVT NTVT = Subtarget.hasSSE4A() + ? MVT::v2f64 + : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32); + return scalarizeVectorStore(St, NTVT, DAG); + } + } - SDValue Ch0 = - DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); - SDValue Ch1 = - DAG.getStore(St->getChain(), dl, Value1, Ptr1, - St->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + // Try to optimize v16i16->v16i8 truncating stores when BWI is not + // supported, but avx512f is by extending to v16i32 and truncating. + if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && + St->getValue().getOpcode() == ISD::TRUNCATE && + St->getValue().getOperand(0).getValueType() == MVT::v16i16 && + TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && + !DCI.isBeforeLegalizeOps()) { + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), + MVT::v16i8, St->getMemOperand()); } // Optimize trunc store (of multiple scalars) to shuffle and store. 
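A hedged sketch of what the non-temporal store splitting above amounts to at the intrinsics level, assuming a destination that is 16-byte aligned but not necessarily 32-byte aligned; the helper name is invented and this is only an illustration of the split, not the lowering itself.

#include <emmintrin.h> // SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence
#include <cassert>

// Store 32 bytes using two 128-bit non-temporal stores at offsets 0 and 16,
// the same shape the combine produces for an under-aligned YMM nt-store.
// Dst must be at least 16-byte aligned; Src may be unaligned.
static void NtStore32(void *Dst, const void *Src) {
  __m128i Lo = _mm_loadu_si128((const __m128i *)Src);
  __m128i Hi = _mm_loadu_si128((const __m128i *)((const char *)Src + 16));
  _mm_stream_si128((__m128i *)Dst, Lo);
  _mm_stream_si128((__m128i *)((char *)Dst + 16), Hi);
  _mm_sfence(); // order the streaming stores with later accesses
}

int main() {
  alignas(16) unsigned char Src[32], Dst[32];
  for (int I = 0; I < 32; ++I)
    Src[I] = (unsigned char)I;
  NtStore32(Dst, Src);
  for (int I = 0; I < 32; ++I)
    assert(Dst[I] == I);
  return 0;
}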
@@ -37763,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Val = detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, TLI)) @@ -37867,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if ((VT.isVector() || + if (((VT.isVector() && !VT.isFloatingPoint()) || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa<LoadSDNode>(St->getValue()) && !cast<LoadSDNode>(St->getValue())->isVolatile() && @@ -37890,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = (Subtarget.is64Bit() && - (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); @@ -37965,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool IsCommutative) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -37979,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. - // At least one of the operands should be a vector shuffle. - if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && - RHS.getOpcode() != ISD::VECTOR_SHUFFLE) - return false; - MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); + unsigned NumElts = VT.getVectorNumElements(); + + // TODO - can we make a general helper method that does all of this for us? + auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, + SmallVectorImpl<int> &ShuffleMask) { + if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (!Op.getOperand(0).isUndef()) + N0 = Op.getOperand(0); + if (!Op.getOperand(1).isUndef()) + N1 = Op.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); + ShuffleMask.append(Mask.begin(), Mask.end()); + return; + } + bool UseSubVector = false; + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op.getOperand(0).getValueType().is256BitVector() && + llvm::isNullConstant(Op.getOperand(1))) { + Op = Op.getOperand(0); + UseSubVector = true; + } + bool IsUnary; + SmallVector<SDValue, 2> SrcOps; + SmallVector<int, 16> SrcShuffleMask; + SDValue BC = peekThroughBitcasts(Op); + if (isTargetShuffle(BC.getOpcode()) && + getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false, + SrcOps, SrcShuffleMask, IsUnary)) { + if (!UseSubVector && SrcShuffleMask.size() == NumElts && + SrcOps.size() <= 2) { + N0 = SrcOps.size() > 0 ? 
SrcOps[0] : SDValue(); + N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue(); + ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end()); + } + if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) && + SrcOps.size() == 1) { + N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op)); + N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op)); + ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts); + ShuffleMask.append(Mask.begin(), Mask.end()); + } + } + }; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle, then pretend it is the identity shuffle: // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: A default initialized SDValue represents an UNDEF of type VT. - unsigned NumElts = VT.getVectorNumElements(); SDValue A, B; - SmallVector<int, 16> LMask(NumElts); - if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!LHS.getOperand(0).isUndef()) - A = LHS.getOperand(0); - if (!LHS.getOperand(1).isUndef()) - B = LHS.getOperand(1); - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); - llvm::copy(Mask, LMask.begin()); - } else { - A = LHS; - for (unsigned i = 0; i != NumElts; ++i) - LMask[i] = i; - } + SmallVector<int, 16> LMask; + GetShuffle(LHS, A, B, LMask); // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 16> RMask(NumElts); - if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!RHS.getOperand(0).isUndef()) - C = RHS.getOperand(0); - if (!RHS.getOperand(1).isUndef()) - D = RHS.getOperand(1); - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); - llvm::copy(Mask, RMask.begin()); - } else { + SmallVector<int, 16> RMask; + GetShuffle(RHS, C, D, RMask); + + // At least one of the operands should be a vector shuffle. + unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1); + if (NumShuffles == 0) + return false; + + if (LMask.empty()) { + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask.push_back(i); + } + + if (RMask.empty()) { C = RHS; for (unsigned i = 0; i != NumElts; ++i) - RMask[i] = i; + RMask.push_back(i); } // If A and B occur in reverse order in RHS, then canonicalize by commuting @@ -38072,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + + if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + return false; + + LHS = DAG.getBitcast(VT, LHS); + RHS = DAG.getBitcast(VT, RHS); return true; } @@ -38088,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal add/sub from adds/subs of shuffles. 
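For orientation, a tiny SSE3 example (compile with -msse3; purely illustrative) of the horizontal operation that isHorizontalBinOp is trying to form from shuffle-plus-add pairs: HADDPS adds adjacent element pairs within each source.

#include <pmmintrin.h> // SSE3: _mm_hadd_ps
#include <cassert>

int main() {
  // HADDPS(a, b) = { a0+a1, a2+a3, b0+b1, b2+b3 }: one horizontal op replaces
  // the shuffle + add pair that isHorizontalBinOp matches.
  __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 B = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
  __m128 R = _mm_hadd_ps(A, B);
  float Out[4];
  _mm_storeu_ps(Out, R);
  assert(Out[0] == 3.0f && Out[1] == 7.0f && Out[2] == 30.0f && Out[3] == 70.0f);
  return 0;
}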
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, IsFadd) && - shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget)) + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); return SDValue(); @@ -38105,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); - unsigned Opcode = Src.getOpcode(); + unsigned SrcOpcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); @@ -38123,14 +40443,17 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, return true; // See if this is a single use constant which can be constant folded. - SDValue BC = peekThroughOneUseBitcasts(Op); - return ISD::isBuildVectorOfConstantSDNodes(BC.getNode()); + // NOTE: We don't peek throught bitcasts here because there is currently + // no support for constant folding truncate+bitcast+vector_of_constants. So + // we'll just send up with a truncate on both operands which will + // get turned back into (truncate (binop)) causing an infinite loop. + return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); }; auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); - return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); + return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1); }; // Don't combine if the operation has other uses. @@ -38145,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // In most cases its only worth pre-truncating if we're only facing the cost // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. - switch (Opcode) { + switch (SrcOpcode) { case ISD::AND: case ISD::XOR: case ISD::OR: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegalOrPromote(Opcode, VT) && + if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; @@ -38160,14 +40483,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. - if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && - !TLI.isOperationLegal(Opcode, SrcVT)) + if (SrcVT.getScalarType() == MVT::i64 && + TLI.isOperationLegal(SrcOpcode, VT) && + !TLI.isOperationLegal(SrcOpcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(Opcode, VT) && + if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; @@ -38177,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // truncatable to avoid interfering with combineSubToSubus. 
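The pre-truncation performed by combineTruncatedArithmetic above is sound because truncation commutes with these operations; a minimal check of that modular-arithmetic fact (values chosen arbitrarily):

#include <cassert>
#include <cstdint>

int main() {
  // Truncation commutes with these ops (arithmetic mod 2^16), so truncating
  // the operands first and operating narrow gives the same low bits.
  uint32_t X = 0xDEADBEEFu, Y = 0x12345678u;
  uint16_t TX = (uint16_t)X, TY = (uint16_t)Y;
  assert((uint16_t)(X + Y) == (uint16_t)(TX + TY));
  assert((uint16_t)(X - Y) == (uint16_t)(TX - TY));
  assert((uint16_t)(X * Y) == (uint16_t)((uint32_t)TX * TY));
  assert((uint16_t)(X & Y) == (uint16_t)(TX & TY));
  assert((uint16_t)(X | Y) == (uint16_t)(TX | TY));
  assert((uint16_t)(X ^ Y) == (uint16_t)(TX ^ TY));
  return 0;
}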
SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(Opcode, VT) && + if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) return TruncateArithmetic(Op0, Op1); break; @@ -38188,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, } /// Truncate using ISD::AND mask and X86ISD::PACKUS. +/// e.g. trunc <8 x i32> X to <8 x i16> --> +/// MaskX = X & 0xffff (clear high bits to prevent saturation) +/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1) static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); - EVT InSVT = InVT.getVectorElementType(); EVT OutVT = N->getValueType(0); - EVT OutSVT = OutVT.getVectorElementType(); - - // Split a long vector into vectors of legal type and mask to unset all bits - // that won't appear in the result to prevent saturation. - // TODO - we should be doing this at the maximum legal size but this is - // causing regressions where we're concatenating back to max width just to - // perform the AND and then extracting back again..... - unsigned NumSubRegs = InVT.getSizeInBits() / 128; - unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); - EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); - SmallVector<SDValue, 8> SubVecs(NumSubRegs); - - APInt Mask = - APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); - SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT); - - for (unsigned i = 0; i < NumSubRegs; i++) { - SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, - DAG.getIntPtrConstant(i * NumSubRegElts, DL)); - SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal); - } - In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs); + APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget); } @@ -38580,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); + unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); + SDValue Op = peekThroughBitcasts(SDValue(N, 0)); - auto VT = Op->getValueType(0); + EVT VT = Op->getValueType(0); + // Make sure the element size does't change. + if (VT.getScalarSizeInBits() != ScalarSize) + return SDValue(); + if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) - return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), - SVOp->getMask()); + if (NegOp0.getValueType() == VT) // FIXME: Can we do better? 
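A scalar model (illustrative only) of why the AND-mask plus PACKUS truncation in combineVectorTruncationWithPACKUS above works: once the high bits are cleared, the unsigned saturation in PACKUS can never trigger, so it degenerates to a plain narrowing.

#include <cassert>
#include <cstdint>

// Scalar model of one PACKUSDW lane: signed i32 input, saturated to u16.
static uint16_t packus32to16(int32_t V) {
  if (V < 0)
    return 0;
  if (V > 0xFFFF)
    return 0xFFFF;
  return (uint16_t)V;
}

int main() {
  // After masking with 0xFFFF the saturation never fires, so the PACKUS
  // simply performs the i32 -> i16 truncation.
  for (uint32_t X : {0u, 1u, 0xFFFFu, 0x12345u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(packus32to16((int32_t)(X & 0xFFFF)) == (uint16_t)X);
  return 0;
}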
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), + SVOp->getMask()); return SDValue(); } unsigned Opc = Op.getOpcode(); @@ -38601,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (!InsVector.isUndef()) return SDValue(); if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) - return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, - NegInsVal, Op.getOperand(2)); + if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, + NegInsVal, Op.getOperand(2)); return SDValue(); } if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB) return SDValue(); - SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); - if (!Op1.getValueType().isFloatingPoint()) - return SDValue(); - - SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); + SDValue Op1 = Op.getOperand(1); + SDValue Op0 = Op.getOperand(0); // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit // masks. For FSUB, we have to check if constant bits of Op0 are sign bit @@ -38625,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { SmallVector<APInt, 16> EltBits; // Extract constant bits and see if they are all sign bit masks. Ignore the // undef elements. - if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(), + if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) { @@ -38922,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, if (Subtarget.useSoftFloat()) return SDValue(); - // TODO: If an operand is already known to be a NaN or not a NaN, this - // should be an optional swap and FMAX/FMIN. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); - if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || - (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + if (!((Subtarget.hasSSE1() && VT == MVT::f32) || + (Subtarget.hasSSE2() && VT == MVT::f64) || + (VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); SDValue Op0 = N->getOperand(0); @@ -38941,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs()) return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); + // If one of the operands is known non-NaN use the native min/max instructions + // with the non-NaN input as second operand. + if (DAG.isKnownNeverNaN(Op1)) + return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); + if (DAG.isKnownNeverNaN(Op0)) + return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags()); + // If we have to respect NaN inputs, this takes at least 3 instructions. // Favor a library call when operating on a scalar and minimizing code size. 
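The sign-bit-mask check in isFNEG above rests on the usual IEEE-754 trick that negation is an XOR with the sign bit; a minimal standalone sketch (helper name invented):

#include <cassert>
#include <cstdint>
#include <cstring>

// Negating an IEEE-754 float is an XOR with the sign-bit mask; this is the
// constant-bits pattern isFNEG looks for on (f)xor/fsub operands.
static float fnegViaXor(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(F));
  Bits ^= 0x80000000u; // flip only the sign bit
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  for (float F : {0.0f, 1.5f, -2.25f, 1e30f})
    assert(fnegViaXor(F) == -F);
  return 0;
}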
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize()) + if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); - EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( - DAG.getDataLayout(), *DAG.getContext(), VT); + EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: @@ -38987,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, KnownZero, DCI)) return SDValue(N, 0); + // Convert a full vector load into vzload when not all bits are needed. + SDValue In = N->getOperand(0); + MVT InVT = In.getSimpleValueType(); + if (VT.getVectorNumElements() < InVT.getVectorNumElements() && + ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { + assert(InVT.is128BitVector() && "Expected 128-bit input vector"); + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + // Unless the load is volatile. + if (!LN->isVolatile()) { + SDLoc dl(N); + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getIntegerVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } + + return SDValue(); +} + +static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // Convert a full vector load into vzload when not all bits are needed. + SDValue In = N->getOperand(0); + MVT InVT = In.getSimpleValueType(); + if (VT.getVectorNumElements() < InVT.getVectorNumElements() && + ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { + assert(InVT.is128BitVector() && "Expected 128-bit input vector"); + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + // Unless the load is volatile. + if (!LN->isVolatile()) { + SDLoc dl(N); + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getFloatingPointVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } + return SDValue(); } @@ -39005,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. 
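A scalar sketch of the new known-non-NaN fast path above, assuming the documented MINSS/MINPS behaviour of returning the second source when either input is NaN: with the non-NaN value placed second, the native instruction already matches fminnum. The model below is illustrative, not the lowering itself.

#include <cassert>
#include <cmath>

// Scalar model of the x86 MINSS rule: if either input is NaN, the second
// operand is returned, otherwise the smaller value.
static float x86MinModel(float A, float B) {
  if (std::isnan(A) || std::isnan(B))
    return B;
  return A < B ? A : B;
}

int main() {
  float NaN = std::nanf("");
  // With a second operand known not to be NaN, this agrees with fminnum
  // (IEEE minNum), which must return the non-NaN input.
  assert(x86MinModel(NaN, 3.0f) == 3.0f);
  assert(x86MinModel(2.0f, 3.0f) == 2.0f);
  assert(x86MinModel(3.0f, 2.0f) == 2.0f);
  return 0;
}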
- if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) { - return DAG.getNode(ISD::AND, SDLoc(N), VT, - N->getOperand(0).getOperand(0), N->getOperand(1)); - } + if (SDValue Not = IsNOT(N->getOperand(0), DAG)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), + N->getOperand(1)); // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -39039,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, // Try to combine sext_in_reg of a cmov of constants by extending the constants. static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + + EVT DstVT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); - if (ExtraVT != MVT::i16) + if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16) return SDValue(); - // Look through single use any_extends. - if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse()) + // Look through single use any_extends / truncs. + SDValue IntermediateBitwidthOp; + if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) && + N0.hasOneUse()) { + IntermediateBitwidthOp = N0; N0 = N0.getOperand(0); + } // See if we have a single use cmov. if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) @@ -39066,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // If we looked through an any_extend above, add one to the constants. - if (N0.getValueType() != VT) { - CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0); - CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1); + // If we looked through an any_extend/trunc above, add one to the constants. + if (IntermediateBitwidthOp) { + unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode(); + CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0); + CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1); + } + + CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1); + CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1); + + EVT CMovVT = DstVT; + // We do not want i16 CMOV's. Promote to i32 and truncate afterwards. 
+ if (DstVT == MVT::i16) { + CMovVT = MVT::i32; + CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0); + CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1); } - CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1); - CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1); + SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1, + N0.getOperand(2), N0.getOperand(3)); - return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1, - N0.getOperand(2), N0.getOperand(3)); + if (CMovVT != DstVT) + CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov); + + return CMov; } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + if (SDValue V = combineSextInRegCmov(N, DAG)) return V; @@ -39336,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, return SDValue(); unsigned Opcode = N->getOpcode(); + // TODO - add ANY_EXTEND support. if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) @@ -39382,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT InVT = N.getValueType(); - EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), - Size / InVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), - DAG.getUNDEF(InVT)); + EVT SrcVT = N.getValueType(); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), + Size / SrcVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(), + DAG.getUNDEF(SrcVT)); Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend @@ -39410,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, ExOp); } @@ -39421,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; - + unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); SmallVector<SDValue, 8> Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, @@ -39457,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Only do this combine with AVX512 for vector extends. - if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC) + if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC) return SDValue(); // Only combine legal element types. 
@@ -39473,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since // that's the only integer compares with we have. - ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get(); + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); if (ISD::isUnsignedIntSetCC(CC)) return SDValue(); @@ -39629,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, if (!NegVal) return SDValue(); + // FIXME: Should we bitcast instead? + if (NegVal.getValueType() != VT) + return SDValue(); + unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -39705,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; + // TODO: Combine with any target/faux shuffle. + if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 && + VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); + unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); + APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); + if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && + (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { + return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + } + } + return SDValue(); } @@ -39734,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); - // Bail out if we know that this is not really just an oversized integer. - if (peekThroughBitcasts(X).getValueType() == MVT::f128 || - peekThroughBitcasts(Y).getValueType() == MVT::f128) + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa<ConstantSDNode>(X) || X.getValueType().isVector() || + X.getOpcode() == ISD::LOAD; + }; + if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && + !IsOrXorXorCCZero) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? @@ -39873,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); + unsigned NumBits = VT.getScalarSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); // Perform constant folding. if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(VT== MVT::i32 && "Unexpected result type"); + assert(VT == MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { - SDValue In = Src.getOperand(Idx); - if (!In.isUndef() && - cast<ConstantSDNode>(In)->getAPIntValue().isNegative()) + if (!Src.getOperand(Idx).isUndef() && + Src.getConstantOperandAPInt(Idx).isNegative()) Imm.setBit(Idx); } return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. 
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() && - SrcVT.isFloatingPoint() && - Src.getOperand(0).getValueType() == - EVT(SrcVT).changeVectorElementTypeToInteger()) - Src = Src.getOperand(0); + unsigned EltWidth = SrcVT.getScalarSizeInBits(); + if (Src.getOpcode() == ISD::BITCAST && + Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) + return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); + + // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results + // with scalar comparisons. + if (SDValue NotSrc = IsNOT(Src, DAG)) { + SDLoc DL(N); + APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); + NotSrc = DAG.getBitcast(SrcVT, NotSrc); + return DAG.getNode(ISD::XOR, DL, VT, + DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc), + DAG.getConstant(NotMask, DL, VT)); + } // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + APInt DemandedMask(APInt::getAllOnesValue(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); - // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)). - // Only do this when the setcc input and output types are the same and the - // setcc and the 'and' node have a single use. - // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't. - APInt SplatVal; - if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && - Src.getOperand(0).getValueType() == Src.getValueType() && - cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE && - ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && - Src.getOperand(0).getOpcode() == ISD::AND) { - SDValue And = Src.getOperand(0); - if (And.hasOneUse() && - ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) && - SplatVal.isPowerOf2()) { - MVT VT = Src.getSimpleValueType(); - unsigned BitWidth = VT.getScalarSizeInBits(); - unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1; - SDLoc DL(And); - SDValue X = And.getOperand(0); - // If the element type is i8, we need to bitcast to i16 to use a legal - // shift. If we wait until lowering we end up with an extra and to bits - // from crossing the 8-bit elements, but we don't care about that here. - if (VT.getVectorElementType() == MVT::i8) { - VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); - X = DAG.getBitcast(VT, X); - } - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(ShAmt, DL, VT)); - SDValue Cast = DAG.getBitcast(SrcVT, Shl); - return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast); - } - } - return SDValue(); } @@ -40065,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. - if (BuildVectorSDNode *BV = - dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); @@ -40088,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, return SDValue(); } +/// If we are converting a value to floating-point, try to replace scalar +/// truncate of an extracted vector element with a bitcast. 
This tries to keep +/// the sequence on XMM registers rather than moving between vector and GPRs. +static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) { + // TODO: This is currently only used by combineSIntToFP, but it is generalized + // to allow being called by any similar cast opcode. + // TODO: Consider merging this into lowering: vectorizeExtractedCast(). + SDValue Trunc = N->getOperand(0); + if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue ExtElt = Trunc.getOperand(0); + if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isNullConstant(ExtElt.getOperand(1))) + return SDValue(); + + EVT TruncVT = Trunc.getValueType(); + EVT SrcVT = ExtElt.getValueType(); + unsigned DestWidth = TruncVT.getSizeInBits(); + unsigned SrcWidth = SrcVT.getSizeInBits(); + if (SrcWidth % DestWidth != 0) + return SDValue(); + + // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0) + EVT SrcVecVT = ExtElt.getOperand(0).getValueType(); + unsigned VecWidth = SrcVecVT.getSizeInBits(); + unsigned NumElts = VecWidth / DestWidth; + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts); + SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0)); + SDLoc DL(N); + SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT, + BitcastVec, ExtElt.getOperand(1)); + return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt); +} + static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); @@ -40181,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return FILDChain; } } + + if (SDValue V = combineToFPTruncExtElt(N, DAG)) + return V; + return SDValue(); } @@ -40267,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) && onlyZeroFlagUsed(SDValue(N, 0))) { - EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); - unsigned ShAmt = Op.getConstantOperandVal(1); - if (ShAmt < BitWidth) { // Avoid undefined shifts. + const APInt &ShAmt = Op.getConstantOperandAPInt(1); + if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts. + unsigned MaskBits = BitWidth - ShAmt.getZExtValue(); APInt Mask = Op.getOpcode() == ISD::SRL - ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) - : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + ? APInt::getHighBitsSet(BitWidth, MaskBits) + : APInt::getLowBitsSet(BitWidth, MaskBits); if (Mask.isSignedIntN(32)) { Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -40283,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { } } - // Look for a truncate with a single use. if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse()) return SDValue(); @@ -40337,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { return Op.getValue(1); } +static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) && + "Expected X86ISD::ADD or X86ISD::SUB"); + + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + MVT VT = LHS.getSimpleValueType(); + unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB; + + // If we don't use the flag result, simplify back to a generic ADD/SUB. 
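A small check (little-endian x86 assumed, names invented) of the byte-level reinterpretation used by combineToFPTruncExtElt above: truncating element 0 of a wider-element vector reads the same bytes as element 0 of the same vector bitcast to the narrower element type.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // A v2i64 value in memory, then viewed as v4i32 via a byte-preserving
  // bitcast.
  uint64_t V2[2] = {0x1122334455667788ull, 0};
  uint32_t V4[4];
  std::memcpy(V4, V2, sizeof(V2));
  // On little-endian x86, trunc(extractelement(v2i64 X, 0)) to i32 reads the
  // same bytes as extractelement(bitcast X to v4i32, 0).
  assert((uint32_t)V2[0] == V4[0]);
  return 0;
}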
+ if (!N->hasAnyUseOfValue(1)) { + SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS); + return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL); + } + + // Fold any similar generic ADD/SUB opcodes to reuse this node. + auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { + SDValue Ops[] = {N0, N1}; + SDVTList VTs = DAG.getVTList(N->getValueType(0)); + if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { + SDValue Op(N, 0); + if (Negate) + Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); + DCI.CombineTo(GenericAddSub, Op); + } + }; + MatchGeneric(LHS, RHS, false); + MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); + + return SDValue(); +} + static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, @@ -40346,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { Flags); } + // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) + // iff the flag result is dead. + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && + !N->hasAnyUseOfValue(1)) + return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), + Op0.getOperand(1), N->getOperand(2)); + return SDValue(); } @@ -40372,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, Res1, CarryOut); } - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, @@ -40468,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), @@ -40575,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); + MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; auto BuildPMADDWD = [&](SDValue Mul) { @@ -40631,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. - // To match SAD, we need the other operand to be a vector select. - if (Op0.getOpcode() != ISD::VSELECT) + // To match SAD, we need the other operand to be a ABS. 
+ if (Op0.getOpcode() != ISD::ABS) std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::VSELECT) + if (Op0.getOpcode() != ISD::ABS) return SDValue(); auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { @@ -40673,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, Op0 = BuildPSADBW(SadOp0, SadOp1); // It's possible we have a sad on the other side too. - if (Op1.getOpcode() == ISD::VSELECT && + if (Op1.getOpcode() == ISD::ABS && detectZextAbsDiff(Op1, SadOp0, SadOp1)) { Op1 = BuildPSADBW(SadOp0, SadOp1); } @@ -40815,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, PMADDBuilder); } -// Try to turn (add (umax X, C), -C) into (psubus X, C) -static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // psubus is available in SSE2 for i8 and i16 vectors. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || - !isPowerOf2_32(VT.getVectorNumElements()) || - !(VT.getVectorElementType() == MVT::i8 || - VT.getVectorElementType() == MVT::i16)) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (Op0.getOpcode() != ISD::UMAX) - return SDValue(); - - // The add should have a constant that is the negative of the max. - // TODO: Handle build_vectors with undef elements. - auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { - return Max->getAPIntValue() == (-Op->getAPIntValue()); - }; - if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT)) - return SDValue(); - - SDLoc DL(N); - return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0), - Op0.getOperand(1)); -} - // Attempt to turn this pattern into PMADDWD. // (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) @@ -40957,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, ArrayRef<SDValue> Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. - EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i16 && + EVT OpVT = Ops[0].getValueType(); + assert(OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"); - assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); + assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - InVT.getVectorNumElements() / 2); + OpVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, @@ -40990,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. 
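For reference, a scalar model (illustrative only) of the PSADBW lane that the ABS-based SAD matching above ultimately forms: per-byte abs(sub(zext a, zext b)) values summed horizontally.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Reference model of one PSADBW lane: the sum of absolute byte differences.
static unsigned sad8(const uint8_t *A, const uint8_t *B) {
  unsigned Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (unsigned)std::abs((int)A[I] - (int)B[I]);
  return Sum;
}

int main() {
  uint8_t A[8] = {0, 10, 200, 255, 7, 7, 128, 3};
  uint8_t B[8] = {5, 10, 100, 0, 9, 1, 127, 200};
  assert(sad8(A, B) == 5 + 0 + 100 + 255 + 2 + 6 + 1 + 197);
  return 0;
}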
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) && - shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { + Subtarget.hasSSSE3() && + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); @@ -41003,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; - if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget)) - return V; - return combineAddOrSubToADCOrSBB(N, DAG); } @@ -41110,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa<ConstantSDNode>(Op1.getOperand(1))) { - APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); + const APInt &XorC = Op1.getConstantOperandAPInt(1); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), @@ -41124,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) && - shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { + Subtarget.hasSSSE3() && + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); @@ -41159,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Helper that combines an array of subvector ops as if they were the operands +/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g. +/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type. +static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, + ArrayRef<SDValue> Ops, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); + + if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) + return DAG.getUNDEF(VT); + + if (llvm::all_of(Ops, [](SDValue Op) { + return ISD::isBuildVectorAllZeros(Op.getNode()); + })) + return getZeroVector(VT, Subtarget, DAG, DL); + + SDValue Op0 = Ops[0]; + + // Fold subvector loads into one. + // If needed, look through bitcasts to get to the load. + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) { + bool Fast; + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + *FirstLd->getMemOperand(), &Fast) && + Fast) { + if (SDValue Ld = + EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + return Ld; + } + } + + // Repeated subvectors. + if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) { + // If this broadcast/subv_broadcast is inserted into both halves, use a + // larger broadcast/subv_broadcast. 
+ if (Op0.getOpcode() == X86ISD::VBROADCAST || + Op0.getOpcode() == X86ISD::SUBV_BROADCAST) + return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); + + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) + if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && + (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, + Op0.getOperand(0), + DAG.getIntPtrConstant(0, DL))); + + // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) + if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && + (Subtarget.hasAVX2() || + (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) && + Op0.getOperand(0).getValueType() == VT.getScalarType()) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); + } + + bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); + + // Repeated opcode. + // TODO - combineX86ShufflesRecursively should handle shuffle concatenation + // but it currently struggles with different vector widths. + if (llvm::all_of(Ops, [Op0](SDValue Op) { + return Op.getOpcode() == Op0.getOpcode(); + })) { + unsigned NumOps = Ops.size(); + switch (Op0.getOpcode()) { + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFD: + if (!IsSplat && NumOps == 2 && VT.is256BitVector() && + Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + LLVM_FALLTHROUGH; + case X86ISD::VPERMILPI: + // TODO - add support for vXf64/vXi64 shuffles. + if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) && + Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0))); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src); + Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res, + Op0.getOperand(1)); + return DAG.getBitcast(VT, Res); + } + break; + case X86ISD::PACKUS: + if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); + } + break; + } + } + + // If we're inserting all zeros into the upper half, change this to + // an insert into an all zeros vector. We will match this to a move + // with implicit upper bit zeroing during isel. 
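// For example (illustrative, not part of this patch), for v8i32:
//   concat_vectors X, zeroinitializer
//     --> insert_subvector (all-zeros v8i32), X, 0
// which isel can select as a 128-bit VEX move that zeroes the upper lanes
// implicitly.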
+ if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), Ops[0], + DAG.getIntPtrConstant(0, DL)); + + return SDValue(); +} + +static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + EVT SrcVT = N->getOperand(0).getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Don't do anything for i1 vectors. + if (VT.getVectorElementType() == MVT::i1) + return SDValue(); + + if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { + SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); + if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, + DCI, Subtarget)) + return R; + } + + return SDValue(); +} + static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -41173,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); - unsigned IdxVal = N->getConstantOperandVal(2); + uint64_t IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); - if (ISD::isBuildVectorAllZeros(Vec.getNode())) { - // Inserting zeros into zeros is a nop. - if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, dl); + if (Vec.isUndef() && SubVec.isUndef()) + return DAG.getUNDEF(OpVT); + + // Inserting undefs/zeros into zeros/undefs is a zero vector. + if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && + (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { - unsigned Idx2Val = SubVec.getConstantOperandVal(2); + uint64_t Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), @@ -41197,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandVal(1) == 0 && + SubVec.getConstantOperandAPInt(1) == 0 && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandVal(2) == 0 && + if (Ins.getConstantOperandAPInt(2) == 0 && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } - - // If we're inserting a bitcast into zeros, rewrite the insert and move the - // bitcast to the other side. This helps with detecting zero extending - // during isel. - // TODO: Is this useful for other indices than 0? 
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { - MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); - unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); - MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); - SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, - DAG.getBitcast(NewVT, Vec), - SubVec.getOperand(0), N->getOperand(2)); - return DAG.getBitcast(OpVT, Insert); - } } // Stop here if this is an i1 vector. @@ -41248,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, } } - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) + // Match concat_vector style patterns. + SmallVector<SDValue, 2> SubVectorOps; + if (collectConcatOps(N, SubVectorOps)) + if (SDValue Fold = + combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) + return Fold; + + // If we are inserting into both halves of the vector, the starting vector + // should be undef. If it isn't, make it so. Only do this if the early insert + // has no other uses. + // TODO: Should this be a generic DAG combine? + // TODO: Why doesn't SimplifyDemandedVectorElts catch this? if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - if (isNullConstant(Vec.getOperand(2))) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, - Subtarget, false)) - return Ld; - } - } - // If lower/upper loads are the same and there's no other use of the lower - // load, then splat the loaded value with a broadcast. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse()) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. 
- if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, - getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, - Vec.getOperand(2)); + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && + isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && + Vec.hasOneUse()) { + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), + Vec.getOperand(1), Vec.getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, + N->getOperand(2)); + } - // If we are inserting into both halves of the vector, the starting - // vector should be undef. If it isn't, make it so. Only do this if the - // the early insert has no other uses. - // TODO: Should this be a generic DAG combine? - if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - SubVec2, Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If this is a broadcast insert into an upper undef, use a larger broadcast. + if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) + return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); - } - } + return SDValue(); +} + +/// If we are extracting a subvector of a vector select and the select condition +/// is composed of concatenated vectors, try to narrow the select width. This +/// is a common pattern for AVX1 integer code because 256-bit selects may be +/// legal, but there is almost no integer math/logic available for 256-bit. +/// This function should only be called with legal types (otherwise, the calls +/// to get simple value types will assert). +static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { + SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); + SmallVector<SDValue, 4> CatOps; + if (Sel.getOpcode() != ISD::VSELECT || + !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) + return SDValue(); + + // Note: We assume simple value types because this should only be called with + // legal operations/types. + // TODO: This can be extended to handle extraction to 256-bits. + MVT VT = Ext->getSimpleValueType(0); + if (!VT.is128BitVector()) + return SDValue(); + + MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); + if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) + return SDValue(); + + MVT WideVT = Ext->getOperand(0).getSimpleValueType(); + MVT SelVT = Sel.getSimpleValueType(); + assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && + "Unexpected vector type with legal operations"); + + unsigned SelElts = SelVT.getVectorNumElements(); + unsigned CastedElts = WideVT.getVectorNumElements(); + unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue(); + if (SelElts % CastedElts == 0) { + // The select has the same or more (narrower) elements than the extract + // operand. The extraction index gets scaled by that factor. + ExtIdx *= (SelElts / CastedElts); + } else if (CastedElts % SelElts == 0) { + // The select has less (wider) elements than the extract operand. Make sure + // that the extraction index can be divided evenly. 
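// Worked example (illustrative, not part of this patch): extracting the
// upper v4i32 half (ExtIdx = 4) of a v8i32 bitcast of a v4i64 vselect gives
// CastedElts = 8 and SelElts = 4, so IndexDivisor = 2 and the select is
// narrowed at element 4 / 2 = 2, i.e. its upper v2i64 half.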
+ unsigned IndexDivisor = CastedElts / SelElts; + if (ExtIdx % IndexDivisor != 0) + return SDValue(); + ExtIdx /= IndexDivisor; + } else { + llvm_unreachable("Element count of simple vector types are not divisible?"); } - return SDValue(); + unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); + unsigned NarrowElts = SelElts / NarrowingFactor; + MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); + SDLoc DL(Ext); + SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); + SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); + SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); + SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); + return DAG.getBitcast(VT, NarrowSel); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, @@ -41334,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Capture the original wide type in the likely case that we need to bitcast // back to this type. - EVT VT = N->getValueType(0); + if (!N->getValueType(0).isSimple()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); EVT WideVecVT = N->getOperand(0).getValueType(); SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41360,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - MVT OpVT = N->getSimpleValueType(0); + if (SDValue V = narrowExtractedVectorSelect(N, DAG)) + return V; + SDValue InVec = N->getOperand(0); unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { - if (OpVT.getScalarType() == MVT::i1) - return DAG.getConstant(1, SDLoc(N), OpVT); - return getOnesVector(OpVT, DAG, SDLoc(N)); + if (VT.getScalarType() == MVT::i1) + return DAG.getConstant(1, SDLoc(N), VT); + return getOnesVector(VT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( - OpVT, SDLoc(N), - InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); + VT, SDLoc(N), + InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); + + // Try to move vector bitcast after extract_subv by scaling extraction index: + // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') + // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR + if (InVec.getOpcode() == ISD::BITCAST && + InVec.getOperand(0).getValueType().isVector()) { + SDValue SrcOp = InVec.getOperand(0); + EVT SrcVT = SrcOp.getValueType(); + unsigned SrcNumElts = SrcVT.getVectorNumElements(); + unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + SrcOp, NewIndex); + return DAG.getBitcast(VT, NewExtract); + } + } + } + } + + // If we're extracting from a broadcast then we're better off just + // broadcasting to the smaller type directly, assuming this is the only use. + // As its a broadcast we don't care about the extraction index. + if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && + InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { - return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); + } + // v2f64 CVTUDQ2PD(v4i32). + if (InOpcode == ISD::UINT_TO_FP && + InVec.getOperand(0).getValueType() == MVT::v4i32) { + return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { - return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && - OpVT.is128BitVector() && - InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - unsigned ExtOp = - InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG; - return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); - } - if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + if ((InOpcode == ISD::ANY_EXTEND || + InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || + InOpcode == ISD::ZERO_EXTEND || + InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - OpVT.is128BitVector() && + VT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); + unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); + return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); } - if (InOpcode == ISD::BITCAST) { - // TODO - do this for target shuffles in general. 
- SDValue InVecBC = peekThroughOneUseBitcasts(InVec); - if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) { - SDLoc DL(N); - SDValue SubPSHUFB = - DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, - extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL), - extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL)); - return DAG.getBitcast(OpVT, SubPSHUFB); - } + if (InOpcode == ISD::VSELECT && + InVec.getOperand(0).getValueType().is256BitVector() && + InVec.getOperand(1).getValueType().is256BitVector() && + InVec.getOperand(2).getValueType().is256BitVector()) { + SDLoc DL(N); + SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128); + SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128); + SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); + return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } } @@ -41428,6 +44058,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + SDLoc DL(N); // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. // This occurs frequently in our masked scalar intrinsic code and our @@ -41436,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) if (C->getAPIntValue().isOneValue()) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, + return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0)); // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. @@ -41445,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) if (C->isNullValue()) - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - Src.getOperand(0), Src.getOperand(1)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0), + Src.getOperand(1)); + + // Reduce v2i64 to v4i32 if we don't need the upper bits. + // TODO: Move to DAGCombine? + if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND && + Src.getValueType() == MVT::i64 && Src.hasOneUse() && + Src.getOperand(0).getScalarValueSizeInBits() <= 32) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, + DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32))); return SDValue(); } @@ -41483,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Try to merge vector loads and extend_inreg to an extload. + if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && + In.hasOneUse()) { + auto *Ld = cast<LoadSDNode>(In); + if (!Ld->isVolatile()) { + MVT SVT = In.getSimpleValueType().getVectorElementType(); + ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, + VT.getVectorNumElements()); + if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { + SDValue Load = + DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + return Load; + } + } + } + + // Disabling for widening legalization for now. We can enable if we find a + // case that needs it. Otherwise it can be deleted when we switch to + // widening legalization. + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + // Combine (ext_invec (ext_invec X)) -> (ext_invec X) + if (In.getOpcode() == N->getOpcode() && + TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); + + // Attempt to combine as a shuffle. + // TODO: SSE41 support + if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { + SDValue Op(N, 0); + if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -41494,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case ISD::CONCAT_VECTORS: + return combineConcatVectors(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: @@ -41506,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); + case X86ISD::ADD: + case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); + case ISD::SHL: return combineShiftLeft(N, DAG); + case ISD::SRA: return combineShiftRightArithmetic(N, DAG); + case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); - case ISD::STORE: return combineStore(N, DAG, Subtarget); + case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); @@ -41535,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); - case X86ISD::CVTSI2P: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); + case X86ISD::CVTP2SI: + case X86ISD::CVTP2UI: + 
case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI, + Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); @@ -41624,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; - // 8-bit multiply is probably not much cheaper than 32-bit multiply, and - // we have specializations to turn 32-bit multiply into LEA or other ops. + // TODO: Almost no 8-bit ops are desirable because they have no actual + // size/speed advantages vs. 32-bit ops, but they do have a major + // potential disadvantage by causing partial register stalls. + // + // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and + // we have specializations to turn 32-bit multiply/shl into LEA or other ops. // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally // check for a constant operand to the multiply. - if (Opc == ISD::MUL && VT == MVT::i8) + if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8) return false; // i16 instruction encodings are longer and some i16 instructions are slow, @@ -41642,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: + case ISD::SRA: case ISD::SRL: case ISD::SUB: case ISD::ADD: @@ -41717,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { case ISD::ANY_EXTEND: break; case ISD::SHL: + case ISD::SRA: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). @@ -41889,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } +static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { + X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint) + .Case("{@cca}", X86::COND_A) + .Case("{@ccae}", X86::COND_AE) + .Case("{@ccb}", X86::COND_B) + .Case("{@ccbe}", X86::COND_BE) + .Case("{@ccc}", X86::COND_B) + .Case("{@cce}", X86::COND_E) + .Case("{@ccz}", X86::COND_E) + .Case("{@ccg}", X86::COND_G) + .Case("{@ccge}", X86::COND_GE) + .Case("{@ccl}", X86::COND_L) + .Case("{@ccle}", X86::COND_LE) + .Case("{@ccna}", X86::COND_BE) + .Case("{@ccnae}", X86::COND_B) + .Case("{@ccnb}", X86::COND_AE) + .Case("{@ccnbe}", X86::COND_A) + .Case("{@ccnc}", X86::COND_AE) + .Case("{@ccne}", X86::COND_NE) + .Case("{@ccnz}", X86::COND_NE) + .Case("{@ccng}", X86::COND_LE) + .Case("{@ccnge}", X86::COND_L) + .Case("{@ccnl}", X86::COND_GE) + .Case("{@ccnle}", X86::COND_G) + .Case("{@ccno}", X86::COND_NO) + .Case("{@ccnp}", X86::COND_P) + .Case("{@ccns}", X86::COND_NS) + .Case("{@cco}", X86::COND_O) + .Case("{@ccp}", X86::COND_P) + .Case("{@ccs}", X86::COND_S) + .Default(X86::COND_INVALID); + return Cond; +} + /// Given a constraint letter, return the type of constraint for this target. 
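// Illustrative use of the "@cc" flag-output constraints handled above; the
// function name and asm body are assumed examples, not from this patch:
//
//   int below(unsigned a, unsigned b) {
//     int cf;
//     asm("cmpl %2, %1" : "=@ccb"(cf) : "r"(a), "r"(b));
//     return cf; // 1 iff a < b (unsigned), taken straight from EFLAGS.CF
//   }
//
// The "=@ccb" constraint reaches this code as "{@ccb}" and maps to
// X86::COND_B.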
X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { @@ -41949,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } } - } + } else if (parseConstraintCode(Constraint) != X86::COND_INVALID) + return C_Other; return TargetLowering::getConstraintType(Constraint); } @@ -42120,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const { return TargetLowering::LowerXConstraint(ConstraintVT); } +// Lower @cc targets via setcc. +SDValue X86TargetLowering::LowerAsmOutputForConstraint( + SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo, + SelectionDAG &DAG) const { + X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); + if (Cond == X86::COND_INVALID) + return SDValue(); + // Check that return type is valid. + if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || + OpInfo.ConstraintVT.getSizeInBits() < 8) + report_fatal_error("Flag output operand is of invalid type"); + + // Get EFLAGS register. Only update chain when copyfrom is glued. + if (Flag.getNode()) { + Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag); + Chain = Flag.getValue(1); + } else + Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32); + // Extract CC code. + SDValue CC = getSETCC(Cond, Flag, DL, DAG); + // Extend to 32-bits + SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); + + return Result; +} + /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, @@ -42229,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { - // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); + bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; + BooleanContent BCont = getBooleanContents(MVT::i64); + ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) + : ISD::SIGN_EXTEND; + int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() + : CST->getSExtValue(); + Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); break; } @@ -42242,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. - GlobalAddressSDNode *GA = nullptr; - int64_t Offset = 0; - - // Match either (GA), (GA+C), (GA+C1+C2), etc. - while (1) { - if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { - Offset += GA->getOffset(); - break; - } else if (Op.getOpcode() == ISD::ADD) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - Offset += C->getZExtValue(); - Op = Op.getOperand(0); - continue; - } - } else if (Op.getOpcode() == ISD::SUB) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - Offset += -C->getZExtValue(); - Op = Op.getOperand(0); - continue; - } - } - - // Otherwise, this isn't something we can handle, reject it. - return; - } - - const GlobalValue *GV = GA->getGlobal(); - // If we require an extra load to get this address, as in PIC mode, we - // can't accept it. 
- if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) - return; - - Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), - GA->getValueType(0), Offset); + if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. + if (isGlobalStubReference( + Subtarget.classifyGlobalReference(GA->getGlobal()))) + return; break; } } @@ -42307,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) { RC.hasSuperClassEq(&X86::VR512RegClass); } +/// Check if \p RC is a mask register class. +/// I.e., VK* or one of their variant. +static bool isVKClass(const TargetRegisterClass &RC) { + return RC.hasSuperClassEq(&X86::VK1RegClass) || + RC.hasSuperClassEq(&X86::VK2RegClass) || + RC.hasSuperClassEq(&X86::VK4RegClass) || + RC.hasSuperClassEq(&X86::VK8RegClass) || + RC.hasSuperClassEq(&X86::VK16RegClass) || + RC.hasSuperClassEq(&X86::VK32RegClass) || + RC.hasSuperClassEq(&X86::VK64RegClass); +} + std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, @@ -42317,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // GCC Constraint Letters switch (Constraint[0]) { default: break; + // 'A' means [ER]AX + [ER]DX. + case 'A': + if (Subtarget.is64Bit()) + return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); + assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && + "Expecting 64, 32 or 16 bit subtarget"); + return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); + // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { - // Only supported in AVX512 or later. - switch (VT.SimpleTy) { - default: break; - case MVT::i32: - return std::make_pair(0U, &X86::VK32RegClass); - case MVT::i16: - return std::make_pair(0U, &X86::VK16RegClass); - case MVT::i8: - return std::make_pair(0U, &X86::VK8RegClass); - case MVT::i1: + if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1RegClass); - case MVT::i64: + if (VT == MVT::i8) + return std::make_pair(0U, &X86::VK8RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::VK16RegClass); + } + if (Subtarget.hasBWI()) { + if (VT == MVT::i32) + return std::make_pair(0U, &X86::VK32RegClass); + if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64RegClass); - } } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. @@ -42403,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Scalar SSE types. 
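// For example (illustrative, not part of this patch), a 'float' operand
// under the "v" constraint, e.g. asm("" : "=v"(f)), is placed in FR32X when
// AVX-512VL is available and in FR32 otherwise, per the cases below.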
case MVT::f32: case MVT::i32: - if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) + if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: @@ -42431,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); - return std::make_pair(0U, &X86::VR256RegClass); + if (Subtarget.hasAVX()) + return std::make_pair(0U, &X86::VR256RegClass); + break; case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: - return std::make_pair(0U, &X86::VR512RegClass); + if (!Subtarget.hasAVX512()) break; + if (VConstraint) + return std::make_pair(0U, &X86::VR512RegClass); + return std::make_pair(0U, &X86::VR512_0_15RegClass); } break; } @@ -42457,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. - if (Subtarget.hasAVX512()) { // Only supported in AVX512. - switch (VT.SimpleTy) { - default: break; - case MVT::i32: - return std::make_pair(0U, &X86::VK32WMRegClass); - case MVT::i16: - return std::make_pair(0U, &X86::VK16WMRegClass); - case MVT::i8: - return std::make_pair(0U, &X86::VK8WMRegClass); - case MVT::i1: + if (Subtarget.hasAVX512()) { + if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1WMRegClass); - case MVT::i64: + if (VT == MVT::i8) + return std::make_pair(0U, &X86::VK8WMRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::VK16WMRegClass); + } + if (Subtarget.hasBWI()) { + if (VT == MVT::i32) + return std::make_pair(0U, &X86::VK32WMRegClass); + if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64WMRegClass); - } } break; } } + if (parseConstraintCode(Constraint) != X86::COND_INVALID) + return std::make_pair(0U, &X86::GR32RegClass); + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass*> Res; @@ -42505,14 +45276,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (StringRef("{flags}").equals_lower(Constraint)) return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); - // 'A' means [ER]AX + [ER]DX. - if (Constraint == "A") { - if (Subtarget.is64Bit()) - return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); - assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && - "Expecting 64, 32 or 16 bit subtarget"); - return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - } + // dirflag -> DF + if (StringRef("{dirflag}").equals_lower(Constraint)) + return std::make_pair(X86::DF, &X86::DFCCRRegClass); + + // fpsr -> FPSW + if (StringRef("{fpsr}").equals_lower(Constraint)) + return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); + return Res; } @@ -42561,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. 
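// For example (illustrative, not part of this patch): a 64-bit operand
// pinned to "{eax}" on a 32-bit target is modeled as the EAX:EDX pair
// (GR32_ADRegClass) below, mirroring GCC's register pairing.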
- switch (Res.first) { - case X86::EAX: + switch (DestReg) { + case X86::RAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - case X86::EDX: + case X86::RDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); - case X86::ECX: + case X86::RCX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); - case X86::EBX: + case X86::RBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); - case X86::ESI: + case X86::RSI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); - case X86::EDI: + case X86::RDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); - case X86::EBP: + case X86::RBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); @@ -42594,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) - Res.second = &X86::FR32RegClass; + Res.second = &X86::FR32XRegClass; else if (VT == MVT::f64 || VT == MVT::i64) - Res.second = &X86::FR64RegClass; - else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) - Res.second = &X86::VR128RegClass; - else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) - Res.second = &X86::VR256RegClass; + Res.second = &X86::FR64XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) + Res.second = &X86::VR128XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) + Res.second = &X86::VR256XRegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { @@ -42608,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.first = 0; Res.second = nullptr; } + } else if (isVKClass(*Class)) { + if (VT == MVT::i1) + Res.second = &X86::VK1RegClass; + else if (VT == MVT::i8) + Res.second = &X86::VK8RegClass; + else if (VT == MVT::i16) + Res.second = &X86::VK16RegClass; + else if (VT == MVT::i32) + Res.second = &X86::VK32RegClass; + else if (VT == MVT::i64) + Res.second = &X86::VK64RegClass; + else { + // Type mismatch and not a clobber: Return an error; + Res.first = 0; + Res.second = nullptr; + } } return Res; @@ -42660,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = - Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); AFI->setIsSplitCSR(true); } @@ -42688,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction().hasFnAttribute( - Attribute::NoUnwind) && - "Function should be nounwind in insertCopiesSplitCSR!"); + assert( + Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); @@ -42709,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const { /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. 
-StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+StringRef
+X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
   // If the function specifically requests stack probes, emit them.
   if (MF.getFunction().hasFnAttribute("probe-stack"))
     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
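For reference, here is a minimal sketch of how a frontend or pass could request custom stack probes so that the hook above picks them up; the module name, function name, and probe symbol are assumed for illustration and are not taken from this patch.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("probe-demo", Ctx);

  // A dummy void() function that should receive stack probes.
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage,
                                 "needs_probes", &M);

  // getStackProbeSymbolName() returns exactly this string when the attribute
  // is present, so frames that need probing call __my_probe rather than the
  // target's default probe routine.
  F->addFnAttr("probe-stack", "__my_probe");
  return 0;
}

This sketch only touches LLVM's Core IR library, so it links against LLVMCore alone.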