Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 8127
1 file changed, 5954 insertions(+), 2173 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3a2cb6be12d2..a1fd34ea8000 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19,6 +19,7 @@ #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -49,6 +50,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" +#include "X86IntrinsicsInfo.h" #include <bitset> #include <numeric> #include <cctype> @@ -65,10 +67,23 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( cl::Hidden); static cl::opt<bool> ExperimentalVectorShuffleLowering( - "x86-experimental-vector-shuffle-lowering", cl::init(false), + "x86-experimental-vector-shuffle-lowering", cl::init(true), cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); +static cl::opt<bool> ExperimentalVectorShuffleLegality( + "x86-experimental-vector-shuffle-legality", cl::init(false), + cl::desc("Enable experimental shuffle legality based on the experimental " + "shuffle lowering. Should only be used with the experimental " + "shuffle lowering."), + cl::Hidden); + +static cl::opt<int> ReciprocalEstimateRefinementSteps( + "x86-recip-refinement-steps", cl::init(1), + cl::desc("Specify the number of Newton-Raphson iterations applied to the " + "result of the hardware reciprocal estimate instruction."), + cl::NotHidden); + // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -99,21 +114,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); - - return Result; - + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } + /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { @@ -150,25 +162,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } + /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes +/// we want. 
It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG,SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } @@ -191,28 +201,10 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(const Triple &TT) { - if (TT.isOSBinFormatMachO()) { - if (TT.getArch() == Triple::x86_64) - return new X86_64MachoTargetObjectFile(); - return new TargetLoweringObjectFileMachO(); - } - - if (TT.isOSLinux()) - return new X86LinuxTargetObjectFile(); - if (TT.isOSBinFormatELF()) - return new TargetLoweringObjectFileELF(); - if (TT.isKnownWindowsMSVCEnvironment()) - return new X86WindowsTargetObjectFile(); - if (TT.isOSBinFormatCOFF()) - return new TargetLoweringObjectFileCOFF(); - llvm_unreachable("unknown subtarget type"); -} - // FIXME: This should stop caching the target machine as soon as // we can remove resetOperationActions et al. -X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) + : TargetLowering(TM) { Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -240,13 +232,13 @@ void X86TargetLowering::resetOperationActions() { // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; - // X86 is weird, it always uses i8 for shift amounts and setcc results. + // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - // For 64-bit since we have so many registers use the ILP scheduler, for - // 32-bit code use the register pressure specific scheduling. + // For 64-bit, since we have so many registers, use the ILP scheduler. + // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); @@ -255,13 +247,14 @@ void X86TargetLowering::resetOperationActions() { else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + TM.getSubtarget<X86Subtarget>().getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2 - if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { - addBypassSlowDiv(32, 8); - if (Subtarget->is64Bit()) + // Bypass expensive divides on Atom when compiling with O2. 
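The addBypassSlowDiv(32, 8) and addBypassSlowDiv(64, 16) hooks in this hunk enable the generic slow-division bypass: the expensive wide divide is guarded by a runtime width check and replaced with the narrow form when both operands fit. A minimal C++ sketch of the resulting shape, illustrative only (the real transform runs on IR and also covers the remainder):

    #include <cstdint>

    // Sketch of a bypassed 32-bit unsigned divide: if both operands fit in
    // 8 bits, the much cheaper 8-bit divide is used; otherwise fall back to
    // the full 32-bit divide.
    uint32_t bypassed_udiv32(uint32_t a, uint32_t b) {
      if (((a | b) & ~0xffu) == 0)            // both operands fit in 8 bits
        return uint8_t(a) / uint8_t(b);       // narrow divide
      return a / b;                           // full-width divide
    }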
+ if (TM.getOptLevel() >= CodeGenOpt::Default) { + if (Subtarget->hasSlowDivide32()) + addBypassSlowDiv(32, 8); + if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } @@ -306,7 +299,8 @@ void X86TargetLowering::resetOperationActions() { if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); @@ -316,6 +310,8 @@ void X86TargetLowering::resetOperationActions() { setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); @@ -529,7 +525,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); @@ -659,8 +657,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. @@ -808,13 +805,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); + setOperationAction(ISD::FMINNUM, MVT::f80, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; + for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); @@ -883,18 +880,19 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction(VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(InnerVT, VT, Expand); + + setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); - // N.b. 
ISD::EXTLOAD legality is basically ignored except for i1-like types, - // we have to deal with them whether we ask for Expansion or not. Setting - // Expand causes its own optimisation problems though, so leave them legal. - if (VT.getVectorElementType() == MVT::i1) - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like + // types, we have to deal with them whether we ask for Expansion or not. + // Setting Expand causes its own optimisation problems though, so leave + // them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -951,12 +949,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); - // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); @@ -997,6 +996,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each vector + // element. If popcnt isn't supported, always provide the custom version. + if (!Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + } + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; @@ -1011,6 +1018,22 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. 
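The sext/anyext load customizations described just above let a sub-128-bit vector extending load be emitted as one scalar load plus an in-register extend instead of element-by-element loads. Roughly what the v4i8 to v4i32 sign-extending case corresponds to at the intrinsics level, assuming SSE4.1 (PMOVSXBD); the helper name is made up for the illustration:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstring>

    // Rough equivalent of a sign-extending load of <4 x i8> to <4 x i32>:
    // one 32-bit scalar load, then an in-register sign extension.
    static __m128i sext_load_v4i8_to_v4i32(const int8_t *p) {
      uint32_t bits;
      std::memcpy(&bits, p, sizeof(bits));        // single scalar load of the 4 bytes
      __m128i v = _mm_cvtsi32_si128((int)bits);   // move into the low 32 bits of an XMM
      return _mm_cvtepi8_epi32(v);                // sign-extend each byte to a 32-bit lane
    }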
+ for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); @@ -1043,8 +1066,6 @@ void X86TargetLowering::resetOperationActions() { AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); @@ -1064,7 +1085,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -1106,7 +1128,15 @@ void X86TargetLowering::resetOperationActions() { // some vselects for now. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - // i8 and i16 vectors are custom , because the source register and source + // SSE41 brings specific instructions for doing vector sign extend even in + // cases where we don't have SRA. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + + // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. @@ -1120,7 +1150,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - // FIXME: these should be Legal but thats only for the case where + // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. 
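The FIXME just above notes that v2i64 INSERT_VECTOR_ELT/EXTRACT_VECTOR_ELT are only really cheap when the index is a constant, because PINSR/PEXTR encode the lane in an immediate; a variable index needs a custom expansion. A small illustration with SSE4.1 intrinsics, using the common spill-to-memory expansion as the fallback (hypothetical helpers, not the exact sequence the backend emits):

    #include <immintrin.h>

    // Compile-time-constant lane: maps directly to a single PINSRD.
    __m128i insert_lane2(__m128i v, int x) {
      return _mm_insert_epi32(v, x, 2);
    }

    // Runtime lane index: no single instruction; a typical expansion spills
    // to memory, writes the element, and reloads the vector.
    __m128i insert_lane_var(__m128i v, int x, unsigned lane) {
      alignas(16) int tmp[4];
      _mm_store_si128((__m128i *)tmp, v);
      tmp[lane & 3] = x;
      return _mm_load_si128((const __m128i *)tmp);
    }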
if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); @@ -1200,7 +1230,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1270,6 +1301,20 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting + // when we have a 256bit-wide blend with immediate. + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each + // vector element. If popcnt isn't supported, always provide the custom + // version. + if (!Subtarget->hasPOPCNT()) + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + // Custom CTPOP always performs better on natively supported v8i32 + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1298,15 +1343,16 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. 
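The ISD::CTPOP customizations in this hunk (and the SSE2 ones earlier) lower vector popcount with the classic bit-twiddling reduction applied lane-wise using packed shifts, ands, and adds, rather than scalarizing to one POPCNT per element. For reference, the scalar 32-bit form of that reduction:

    #include <cstdint>

    // Branch-free population count; the vector CTPOP lowering applies the
    // same masks and partial sums across all lanes at once.
    uint32_t popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                  // 2-bit partial sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);  // 4-bit partial sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                  // 8-bit partial sums
      return (v * 0x01010101u) >> 24;                    // horizontal add of the bytes
    }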
- if (VT.is128BitVector()) + if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; @@ -1351,12 +1397,14 @@ void X86TargetLowering::resetOperationActions() { addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); @@ -1394,6 +1442,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); @@ -1466,16 +1518,13 @@ void X86TargetLowering::resetOperationActions() { } // Custom lower several nodes. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. - if (VT.is128BitVector() || VT.is256BitVector()) + if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); @@ -1491,12 +1540,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to promote non-256-bit vectors + // Do not attempt to promote non-512-bit vectors. 
if (!VT.is512BitVector()) continue; @@ -1505,13 +1556,59 @@ void X86TargetLowering::resetOperationActions() { } }// has AVX-512 + if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + + setOperationAction(ISD::LOAD, MVT::v32i16, Legal); + setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); + setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, Legal); + setOperationAction(ISD::ADD, MVT::v64i8, Legal); + setOperationAction(ISD::SUB, MVT::v32i16, Legal); + setOperationAction(ISD::SUB, MVT::v64i8, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, Legal); + + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + const MVT VT = (MVT::SimpleValueType)i; + + const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Do not attempt to promote non-512-bit vectors. + if (!VT.is512BitVector()) + continue; + + if (EltSize < 32) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + } + } + } + + if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + + setOperationAction(ISD::SETCC, MVT::v4i1, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); + + setOperationAction(ISD::AND, MVT::v8i32, Legal); + setOperationAction(ISD::OR, MVT::v8i32, Legal); + setOperationAction(ISD::XOR, MVT::v8i32, Legal); + setOperationAction(ISD::AND, MVT::v4i32, Legal); + setOperationAction(ISD::OR, MVT::v4i32, Legal); + setOperationAction(ISD::XOR, MVT::v4i32, Legal); + } + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. - for (int VT = MVT::FIRST_VECTOR_VALUETYPE; - VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Custom); - } + for (MVT VT : MVT::vector_valuetypes()) + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -1537,9 +1634,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UMULO, VT, Custom); } - // There are no 8-bit 3-address imul/mul instructions - setOperationAction(ISD::SMULO, MVT::i8, Expand); - setOperationAction(ISD::UMULO, MVT::i8, Expand); if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. @@ -1553,9 +1647,8 @@ void X86TargetLowering::resetOperationActions() { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { - // For MacOSX, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. + // For MacOSX, we don't want the normal expansion of a libcall to sincos. + // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } @@ -1614,8 +1707,15 @@ void X86TargetLowering::resetOperationActions() { // Predictable cmov don't hurt on atom because it's in-order. 
PredictableSelectIsExpensive = !Subtarget->isAtom(); - + EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. + + verifyIntrinsicTables(); +} + +// This has so far only been implemented for 64-bit MachO. +bool X86TargetLowering::useLoadStackGuardNode() const { + return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction @@ -1632,16 +1732,46 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; - if (Subtarget->hasAVX512()) - switch(VT.getVectorNumElements()) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; + const unsigned NumElts = VT.getVectorNumElements(); + const EVT EltVT = VT.getVectorElementType(); + if (VT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } + + if (VT.is256BitVector() || VT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } } return VT.changeVectorElementTypeToInteger(); } -/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) @@ -1666,7 +1796,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { } } -/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. @@ -1685,7 +1815,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { return Align; } -/// getOptimalMemOpType - Returns the target specific optimal type for load +/// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it @@ -1742,15 +1872,16 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { } bool -X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned, - bool *Fast) const { +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { if (Fast) *Fast = Subtarget->isUnalignedMemAccessFast(); return true; } -/// getJumpTableEncoding - Return the entry encoding for a jump table in the +/// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. 
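The getSetCCResultType changes in this hunk make vector compares produce the AVX-512 mask types (v16i1/v8i1, plus v32i1/v64i1 with BWI and v2i1/v4i1/v8i1 with VLX) instead of a wide -1/0 vector. At the source level this is the __mmaskN model; a short AVX-512F intrinsics sketch, purely illustrative:

    #include <immintrin.h>

    // A 16-lane float compare yields a 16-bit mask register value (v16i1),
    // which then predicates a blend instead of materializing a -1/0 vector.
    __m512 clamp_to_limit(__m512 x, __m512 limit) {
      __mmask16 m = _mm512_cmp_ps_mask(x, limit, _CMP_GT_OS);  // k-register result
      return _mm512_mask_blend_ps(m, x, limit);                // pick limit where x > limit
    }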
unsigned X86TargetLowering::getJumpTableEncoding() const { @@ -1776,8 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, MCSymbolRefExpr::VK_GOTOFF, Ctx); } -/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC -/// jumptable. +/// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) @@ -1787,9 +1917,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, return Table; } -/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the -/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an -/// MCExpr. +/// This returns the relocation base for the given PIC jumptable, +/// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { @@ -1810,9 +1939,7 @@ X86TargetLowering::findRepresentativeClass(MVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = Subtarget->is64Bit() ? - (const TargetRegisterClass*)&X86::GR64RegClass : - (const TargetRegisterClass*)&X86::GR32RegClass; + RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; @@ -1867,8 +1994,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1887,8 +2013,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; @@ -1905,7 +2030,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); - // Promote values to the appropriate types + // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) @@ -1916,7 +2041,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && - "Unexpected FP-extend for return value."); + "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. @@ -1934,8 +2059,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. - if (VA.getLocReg() == X86::ST0 || - VA.getLocReg() == X86::ST1) { + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. 
if (isScalarFPTypeInSSEReg(VA.getValVT())) @@ -2021,6 +2146,13 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (UI->getNumOperands() > 4) + return false; + if (UI->getNumOperands() == 4 && + UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) + return false; HasRet = true; } @@ -2031,8 +2163,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -MVT -X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? @@ -2041,11 +2173,11 @@ X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, else ReturnMVT = MVT::i32; - MVT MinVT = getRegisterType(ReturnMVT); + EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } -/// LowerCallResult - Lower the result values of a call into the +/// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue @@ -2058,8 +2190,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2073,33 +2205,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, report_fatal_error("SSE register return with SSE disabled"); } - SDValue Val; - - // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the value is popped from the stack, so - // a CopyFromReg is not good enough - the copy instruction may be eliminated - // if the return value is not used. We use the FpPOP_RETVAL instruction - // instead. - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { - // If we prefer to use the value in xmm registers, copy it out as f80 and - // use a truncate to move it from fp stack reg to xmm reg. - if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; - SDValue Ops[] = { Chain, InFlag }; - Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, - MVT::Other, MVT::Glue, Ops), 1); - Val = Chain.getValue(0); - - // Round the f80 to the right size, which also moves it to the appropriate - // xmm register. - if (CopyVT != VA.getValVT()) - Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, - // This truncation won't change the value. - DAG.getIntPtrConstant(1)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - CopyVT, InFlag).getValue(1); - Val = Chain.getValue(0); - } + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. 
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) + CopyVT = MVT::f80; + + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + SDValue Val = Chain.getValue(0); + + if (CopyVT != VA.getValVT()) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1)); + InFlag = Chain.getValue(2); InVals.push_back(Val); } @@ -2137,8 +2257,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { return StackStructReturn; } -/// ArgsAreStructReturn - Determines whether a function uses struct -/// return semantics. +/// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { if (Ins.empty()) @@ -2152,10 +2271,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { return StackStructReturn; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" with size and alignment information specified by -/// the specific parameter attribute. The copy will be passed as a byval -/// function parameter. +/// Make a copy of an aggregate at address specified by "Src" to address +/// "Dst" with size and alignment information specified by the specific +/// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, @@ -2167,7 +2285,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// IsTailCallConvention - Return true if the calling convention is one that +/// Return true if the calling convention is one that /// supports tail call optimization. static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || @@ -2192,7 +2310,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { return true; } -/// FuncIsMadeTailCallSafe - Return true if the function is being made into +/// Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, bool GuaranteedTailCallOpt) { @@ -2240,6 +2358,55 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, } } +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + + if (Subtarget->isCallingConvWin64(CallConv)) { + static const MCPhysReg GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + } + + static const MCPhysReg GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, + CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + if (Subtarget->isCallingConvWin64(CallConv)) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + // TODO: __vectorcall will change this. 
+ return None; + } + + const Function *Fn = MF.getFunction(); + bool NoImplicitFloatOps = Fn->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so there are no XMM argument + // registers. + return None; + + static const MCPhysReg XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); +} + SDValue X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, @@ -2267,8 +2434,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2312,6 +2478,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2378,125 +2548,146 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. - if (isVarArg) { - if (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall)) { - FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (MFI->hasVAStart() && + (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(MF.getTarget().Options.UseSoftFloat && + Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (Is64Bit && isVarArg && MFI->hasVAStart()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); + unsigned NumIntRegs = + CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); + unsigned NumXMMRegs = + CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + // Gather all the live in physical registers. 
+ SmallVector<SDValue, 6> LiveGPRs; + SmallVector<SDValue, 8> LiveXMMRegs; + SDValue ALVal; + for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back( + DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); + } + if (!ArgXMMs.empty()) { + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); + for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { + unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); + } } - if (Is64Bit) { - unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; - - // FIXME: We should really autogenerate these arrays - static const MCPhysReg GPR64ArgRegsWin64[] = { - X86::RCX, X86::RDX, X86::R8, X86::R9 - }; - static const MCPhysReg GPR64ArgRegs64Bit[] = { - X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 - }; - static const MCPhysReg XMMArgRegs64Bit[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, - X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 - }; - const MCPhysReg *GPR64ArgRegs; - unsigned NumXMMRegs = 0; - - if (IsWin64) { - // The XMM registers which might contain var arg parameters are shadowed - // in their paired GPR. So we only need to save the GPR to their home - // slots. - TotalNumIntRegs = 4; - GPR64ArgRegs = GPR64ArgRegsWin64; - } else { - TotalNumIntRegs = 6; TotalNumXMMRegs = 8; - GPR64ArgRegs = GPR64ArgRegs64Bit; - NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, - TotalNumXMMRegs); - } - unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, - TotalNumIntRegs); - - bool NoImplicitFloatOps = Fn->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && - NoImplicitFloatOps) && - "SSE register cannot be used when SSE is disabled!"); - if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || - !Subtarget->hasSSE1()) - // Kernel mode asks for SSE to be disabled, so don't push them - // on the stack. - TotalNumXMMRegs = 0; - - if (IsWin64) { - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( + if (IsWin64) { + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by deferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex( - MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, - false)); - } - - // Store the integer parameter registers. 
- SmallVector<SDValue, 8> MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(Offset)); - unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); - MemOps.push_back(Store); - Offset += 8; - } - - if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector<SDValue, 11> SaveXMMOps; - SaveXMMOps.push_back(Chain); - - unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); - SaveXMMOps.push_back(ALVal); - - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex())); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset())); - - for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { - unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - &X86::VR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); - SaveXMMOps.push_back(Val); - } - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, + DAG.getIntPtrConstant(Offset)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex())); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset())); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } + + if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + // Find the largest legal vector type. 
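The vararg prologue code above sizes the register save area and records the gp/fp offsets (setVarArgsGPOffset, setVarArgsFPOffset) to match the SysV x86-64 va_list layout: six 8-byte GPR slots followed by eight 16-byte XMM slots. A sketch of that layout, assuming the standard ABI definitions:

    #include <cstdint>

    // SysV x86-64 va_list element, as va_start/va_arg see it.
    struct VaListTag {
      uint32_t gp_offset;      // offset of the next unread GPR slot; 48 means all used
      uint32_t fp_offset;      // offset of the next unread XMM slot; starts at 48, caps at 176
      void *overflow_arg_area; // stack area for arguments that did not fit in registers
      void *reg_save_area;     // 6 * 8 + 8 * 16 = 176-byte block spilled in the prologue
    };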
+ MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget->hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget->hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget->hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Conservatively forward AL on x86_64, since it might be used for varargs. + if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &F : Forwards) { + // FIXME: Can we use a less constrained schedule? + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); + Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } @@ -2544,7 +2735,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, false, false, 0); } -/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call +/// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, @@ -2561,7 +2752,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call +/// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, @@ -2599,6 +2790,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; @@ -2630,8 +2822,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2652,7 +2843,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. - X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; @@ -2671,8 +2861,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // arguments passed in memory when using inalloca. 
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; - assert(ArgLocs.back().getLocMemOffset() == 0 && - "an inalloca argument must be the only memory argument"); + if (!ArgLocs.back().isMemLoc()) + report_fatal_error("cannot use inalloca attribute on a register " + "parameter"); + if (ArgLocs.back().getLocMemOffset() != 0) + report_fatal_error("any parameter with the inalloca attribute must be " + "the only memory argument"); } if (!IsSibcall) @@ -2691,8 +2885,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2791,7 +2985,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (Is64Bit && isVarArg && !IsWin64) { + if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -2813,6 +3007,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } + if (isVarArg && IsMustTail) { + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + for (const auto &F : Forwards) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } + } + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. @@ -2889,10 +3091,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. + GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. @@ -2962,6 +3165,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); + } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { + // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI + Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. @@ -2988,7 +3194,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
- const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3079,9 +3285,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3194,8 +3400,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3223,8 +3429,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3244,12 +3450,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } @@ -3258,13 +3464,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3290,8 +3496,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3308,7 +3514,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3336,7 +3542,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3378,6 +3584,8 @@ static bool MayFoldIntoStore(SDValue Op) { static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: @@ -3395,7 +3603,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; @@ -3421,7 +3629,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -3433,6 +3641,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGNR: + case X86ISD::VALIGN: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3459,8 +3668,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3500,7 +3709,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. - if (M == CodeModel::Kernel && Offset > 0) + if (M == CodeModel::Kernel && Offset >= 0) return true; return false; @@ -3510,23 +3719,18 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, /// own arguments. Callee pop is necessary to support tail calls. 
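
As a reading aid for the isCalleePop() hunk that follows: once the redundant fallthrough returns are folded together, the behaviour can be sketched as the standalone snippet below. The CC enum and the calleePopsArguments name are illustrative stand-ins, not part of the patch or of LLVM.

#include <cstdio>

// Illustrative stand-ins for the CallingConv IDs referenced in the hunk.
enum class CC { C, X86_StdCall, X86_FastCall, X86_ThisCall, Fast, GHC, HiPE };

// Mirrors the consolidated switch: stdcall/fastcall/thiscall pop their own
// arguments only in 32-bit mode, while Fast/GHC/HiPE do so only when
// TailCallOpt is set and never for variadic callees.
static bool calleePopsArguments(CC Conv, bool Is64Bit, bool IsVarArg,
                                bool TailCallOpt) {
  switch (Conv) {
  default:
    return false;
  case CC::X86_StdCall:
  case CC::X86_FastCall:
  case CC::X86_ThisCall:
    return !Is64Bit;
  case CC::Fast:
  case CC::GHC:
  case CC::HiPE:
    if (IsVarArg)
      return false;
    return TailCallOpt;
  }
}

int main() {
  // A 32-bit stdcall callee pops its own stack arguments.
  std::printf("%d\n", calleePopsArguments(CC::X86_StdCall, /*Is64Bit=*/false,
                                          /*IsVarArg=*/false,
                                          /*TailCallOpt=*/false)); // prints 1
}
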
bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { - if (IsVarArg) - return false; - switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: - return !is64Bit; case CallingConv::X86_FastCall: - return !is64Bit; case CallingConv::X86_ThisCall: return !is64Bit; case CallingConv::Fast: - return TailCallOpt; case CallingConv::GHC: - return TailCallOpt; case CallingConv::HiPE: + if (IsVarArg) + return false; return TailCallOpt; } } @@ -3667,6 +3871,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtTy, + EVT NewVT) const { + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF + // relocation target a movq or addq instruction: don't let the load shrink. + SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); + if (BasePtr.getOpcode() == X86ISD::WrapperRIP) + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) + return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + return true; +} + /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -3679,6 +3895,24 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Speculate cttz only if we can directly use TZCNT. + return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Speculate ctlz only if we can directly use LZCNT. + return Subtarget->hasLZCNT(); +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -3693,7 +3927,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified -/// sequential range (L, L+Pos]. or is undef. +/// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) @@ -3703,14 +3937,23 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, } /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference -/// the second operand. -static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { - if (VT == MVT::v4f32 || VT == MVT::v4i32 ) - return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); - if (VT == MVT::v2f64 || VT == MVT::v2i64) - return (Mask[0] < 2 && Mask[1] < 2); - return false; +/// is suitable for input to PSHUFD. That is, it doesn't reference the other +/// operand - by default will match for first operand. 
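
The generalized isPSHUFDMask() that follows accepts a mask as long as every entry is undef or indexes the requested operand. Below is a minimal sketch of that check for a 4-element type, assuming the usual half-open [Lo, Hi) reading of isUndefOrInRange; the isPshufdStyleMask name is illustrative and -1 stands for undef.

#include <array>
#include <cstdio>

// Minimal mirror of the generalized check for a 4-element type: every mask
// entry must be undef (modeled as -1) or index into the requested operand,
// i.e. lie in [0, 4) for the first operand or [4, 8) for the second.
static bool isPshufdStyleMask(const std::array<int, 4> &Mask,
                              bool TestSecondOperand) {
  int Lo = TestSecondOperand ? 4 : 0;
  int Hi = Lo + 4;
  for (int M : Mask)
    if (M >= 0 && (M < Lo || M >= Hi))
      return false;
  return true;
}

int main() {
  std::printf("%d\n", isPshufdStyleMask({2, 1, 0, 3}, false)); // 1: V1 only
  std::printf("%d\n", isPshufdStyleMask({6, 5, 4, 7}, true));  // 1: V2 only
  std::printf("%d\n", isPshufdStyleMask({0, 5, 2, 7}, false)); // 0: mixes V1/V2
}
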
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT, + bool TestSecondOperand = false) { + if (VT != MVT::v4f32 && VT != MVT::v4i32 && + VT != MVT::v2f64 && VT != MVT::v2i64) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + unsigned Lo = TestSecondOperand ? NumElems : 0; + unsigned Hi = Lo + NumElems; + + for (unsigned i = 0; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) + return false; + + return true; } /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that @@ -3771,16 +4014,12 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { return true; } -/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || - (VT.is256BitVector() && !Subtarget->hasInt256())) - return false; - +/// \brief Return true if the mask specifies a shuffle of elements that is +/// suitable for input to intralane (palignr) or interlane (valign) vector +/// right-shift. +static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128; + unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. @@ -3844,6 +4083,29 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, return true; } +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to PALIGNR. +static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256()) || + VT.is512BitVector()) + // FIXME: Add AVX512BW. + return false; + + return isAlignrMask(Mask, VT, false); +} + +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to VALIGN. +static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + // FIXME: Add AVX512VL. + if (!VT.is512BitVector() || !Subtarget->hasAVX512()) + return false; + return isAlignrMask(Mask, VT, true); +} + /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, @@ -4086,43 +4348,34 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckl"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. 
- unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + // AVX defines UNPCK* to operate independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) return false; } } } + return true; } @@ -4133,39 +4386,29 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. - unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + // AVX defines UNPCK* to operate independently on 128-bit lanes. 
+ unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j+NumElts)) return false; } } @@ -4668,11 +4911,13 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { return Mask; } -/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with +/// VALIGN (if Interlane is true) instructions. +static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, + bool InterLane) { MVT VT = SVOp->getSimpleValueType(0); - unsigned EltSize = VT.is512BitVector() ? 1 : + unsigned EltSize = InterLane ? 1 : VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4693,6 +4938,19 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR instruction. +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, false); +} + +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the VALIGN instruction. +static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, true); +} + + static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4891,32 +5149,32 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); - SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Cst = DAG.getConstant(0, MVT::i1); SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else @@ -4933,7 +5191,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); - SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); + SDValue Cst = DAG.getConstant(~0U, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 @@ -5103,37 +5361,49 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. -/// Sets IsUnary to true if only uses one source. +/// target specific opcode. Returns true if the Mask could be calculated. Sets +/// IsUnary to true if only uses one source. Note that this will set IsUnary for +/// shuffles which use a single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. 
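
To make the single-input ("fake unary") adjustment described above concrete: an UNPCKL of a v4i32 value with itself decodes to the two-input mask {0, 4, 1, 5}, and the fix-up loop added at the end of this function folds the second-input entries back into the first input. A minimal standalone illustration, not the LLVM helper itself:

#include <cstdio>
#include <vector>

// Illustration only: the mask an UNPCKL-style node yields once both of its
// operands turn out to be the same value ("fake unary").
int main() {
  // DecodeUNPCKLMask for v4i32 interleaves the low halves of two inputs.
  std::vector<int> Mask = {0, 4, 1, 5};
  bool OperandsAreSameNode = true; // e.g. unpcklps %xmm0, %xmm0

  if (OperandsAreSameNode)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= (int)Mask.size();

  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 0 1 1
  std::printf("\n");
}
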
static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; + bool IsFakeUnary = false; switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; @@ -5148,6 +5418,66 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + EVT VT = MaskNode.getValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector<uint64_t, 32> RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. + for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. 
+ RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodePSHUFBMask(C, Mask); + break; + } + + return false; + } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); @@ -5169,17 +5499,29 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; break; + case X86ISD::MOVSLDUP: + DecodeMOVSLDUPMask(VT, Mask); + break; + case X86ISD::MOVSHDUP: + DecodeMOVSHDUPMask(VT, Mask); + break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); } + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + return true; } @@ -5470,76 +5812,112 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, } /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. -static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, - unsigned NonZeros, unsigned NumNonZero, - unsigned NumZero, SelectionDAG &DAG, +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { - // We know there's at least one non-zero element - unsigned FirstNonZeroIdx = 0; - SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); - while (FirstNonZero.getOpcode() == ISD::UNDEF || - X86::isZeroNode(FirstNonZero)) { - ++FirstNonZeroIdx; - FirstNonZero = Op->getOperand(FirstNonZeroIdx); + // Find all zeroable elements. + bool Zeroable[4]; + for (int i=0; i < 4; ++i) { + SDValue Elt = Op->getOperand(i); + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + } + assert(std::count_if(&Zeroable[0], &Zeroable[4], + [](bool M) { return !M; }) > 1 && + "We expect at least two non-zero elements!"); + + // We only know how to deal with build_vector nodes where elements are either + // zeroable or extract_vector_elt with constant index. + SDValue FirstNonZero; + unsigned FirstNonZeroIdx; + for (unsigned i=0; i < 4; ++i) { + if (Zeroable[i]) + continue; + SDValue Elt = Op->getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Elt.getOperand(1))) + return SDValue(); + // Make sure that this node is extracting from a 128-bit vector. 
+ MVT VT = Elt.getOperand(0).getSimpleValueType(); + if (!VT.is128BitVector()) + return SDValue(); + if (!FirstNonZero.getNode()) { + FirstNonZero = Elt; + FirstNonZeroIdx = i; + } } - if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) - return SDValue(); + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); + SDValue V1 = FirstNonZero.getOperand(0); + MVT VT = V1.getSimpleValueType(); - SDValue V = FirstNonZero.getOperand(0); - MVT VVT = V.getSimpleValueType(); - if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) - return SDValue(); + // See if this build_vector can be lowered as a blend with zero. + SDValue Elt; + unsigned EltMaskIdx, EltIdx; + int Mask[4]; + for (EltIdx = 0; EltIdx < 4; ++EltIdx) { + if (Zeroable[EltIdx]) { + // The zero vector will be on the right hand side. + Mask[EltIdx] = EltIdx+4; + continue; + } - unsigned FirstNonZeroDst = - cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); - unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; - unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; - unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + Elt = Op->getOperand(EltIdx); + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. + EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) + break; + Mask[EltIdx] = EltIdx; + } - for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { - SDValue Elem = Op.getOperand(Idx); - if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) - continue; + if (EltIdx == 4) { + // Let the shuffle legalizer deal with blend operations. + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + if (V1.getSimpleValueType() != VT) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + } - // TODO: What else can be here? Deal with it. - if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + // See if we can lower this build_vector to a INSERTPS. + if (!Subtarget->hasSSE41()) + return SDValue(); - // TODO: Some optimizations are still possible here - // ex: Getting one element from a vector, and the rest from another. - if (Elem.getOperand(0) != V) - return SDValue(); + SDValue V2 = Elt.getOperand(0); + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) + V1 = SDValue(); - unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); - if (Dst == Idx) - ++CorrectIdx; - else if (IncorrectIdx == -1U) { - IncorrectIdx = Idx; - IncorrectDst = Dst; - } else - // There was already one element with an incorrect index. - // We can't optimize this case to an insertps. 
- return SDValue(); + bool CanFold = true; + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { + if (Zeroable[i]) + continue; + + SDValue Current = Op->getOperand(i); + SDValue SrcVector = Current->getOperand(0); + if (!V1.getNode()) + V1 = SrcVector; + CanFold = SrcVector == V1 && + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; } - if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { - SDLoc dl(Op); - EVT VT = Op.getSimpleValueType(); - unsigned ElementMoveMask = 0; - if (IncorrectIdx == -1U) - ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; - else - ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + if (!CanFold) + return SDValue(); - SDValue InsertpsMask = - DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); - } + assert(V1.getNode() && "Expected at least two non-zero elements!"); + if (V1.getSimpleValueType() != MVT::v4f32) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + if (V2.getSimpleValueType() != MVT::v4f32) + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); - return SDValue(); + // Ok, we can emit an INSERTPS instruction. + unsigned ZMask = 0; + for (int i = 0; i < 4; ++i) + if (Zeroable[i]) + ZMask |= 1 << i; + + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask)); + return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); } /// getVShift - Return a vector logical shift node. @@ -5685,15 +6063,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue NewLd = SDValue(); - if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + LDBase->getPointerInfo(), LDBase->isVolatile(), + LDBase->isNonTemporal(), LDBase->isInvariant(), + LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -5706,7 +6079,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return NewLd; } - if (NumElems == 4 && LastLoadedElt == 1 && + + //TODO: The code below fires only for for loading the low v2i32 / v2f32 + //of a v4i32 / v4f32. It's probably worth generalizing. + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -5742,7 +6118,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasFp256()) + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. 
+ if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); @@ -5819,17 +6198,34 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } } + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); - // Handle the broadcasting a single constant scalar from the constant pool - // into a vector. On Sandybridge it is still better to load a constant vector + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + const Function *F = DAG.getMachineFunction().getFunction(); + bool OptForSize = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. - if (ConstSplatVal && Subtarget->hasInt256()) { + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5850,7 +6246,6 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. 
if (!IsLoad && Subtarget->hasInt256() && @@ -5861,7 +6256,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -6017,8 +6413,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { AllContants = false; NonConstIdx = idx; NumNonConsts++; - } - else { + } else { NumConsts++; if (cast<ConstantSDNode>(In)->getZExtValue()) Immediate |= (1ULL << idx); @@ -6041,7 +6436,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT::getIntegerVT(VT.getSizeInBits())); DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); } - else + else DstVec = DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(NonConstIdx), @@ -6064,7 +6459,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. -/// +/// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal @@ -6085,7 +6480,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); - + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; @@ -6154,13 +6549,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by -/// a concat_vector. +/// a concat_vector. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two -/// horizontal binary operations. +/// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// @@ -6235,11 +6630,6 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); - // Don't try to emit a VSELECT that cannot be lowered into a blend. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) - return SDValue(); - // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from @@ -6251,14 +6641,14 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, for (unsigned i = 0, e = NumElts; i != e; i++) { SDValue Op = BV->getOperand(i); - + // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } - + // Early exit if we found an unexpected opcode. 
if (Opcode != ExpectedOpcode) return SDValue(); @@ -6312,34 +6702,11 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, std::swap(ExpectedOpcode, NextExpectedOpcode); } - // Don't try to fold this build_vector into a VSELECT if it has - // too many UNDEF operands. + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && - InVec1.getOpcode() != ISD::UNDEF) { - // Emit a sequence of vector add and sub followed by a VSELECT. - // The new VSELECT will be lowered into a BLENDI. - // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' - // and emit a single ADDSUB instruction. - SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); - SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); - - // Construct the VSELECT mask. - EVT MaskVT = VT.changeVectorElementTypeToInteger(); - EVT SVT = MaskVT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - SmallVector<SDValue, 8> Ops; + InVec1.getOpcode() != ISD::UNDEF) + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); - for (unsigned i = 0, e = NumElts; i != e; ++i) { - APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : - APInt::getAllOnesValue(SVTBits); - SDValue Constant = DAG.getConstant(Value, SVT); - Ops.push_back(Constant); - } - - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); - return DAG.getSelect(DL, VT, Mask, Sub, Add); - } - return SDValue(); } @@ -6382,18 +6749,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } - + if (!Subtarget->hasAVX()) return SDValue(); @@ -6444,7 +6811,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); - + // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) @@ -6575,6 +6942,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + + // If using the new shuffle lowering, just directly insert this. 
+ if (ExperimentalVectorShuffleLowering) + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of @@ -6648,6 +7022,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // If using the new shuffle lowering, just directly insert this. + if (ExperimentalVectorShuffleLowering) + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; @@ -6677,13 +7055,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - // For AVX-length vectors, build the individual 128-bit pieces and use + // For AVX-length vectors, see if we can use a vector load to get all of the + // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { SmallVector<SDValue, 64> V; for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); + // Check for a build vector of consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. @@ -6725,8 +7108,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, - NumZero, DAG, Subtarget, *this); + SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); if (V.getNode()) return V; } @@ -6917,6 +7299,89 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { return true; } +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. 
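
A worked example for the repeated-lane check that follows, mirrored in plain C++ for an 8 x 32-bit vector type; the isLaneRepeated name and the fixed sizes are illustrative, not part of the patch.

#include <array>
#include <cstdio>

// Plain-C++ mirror of the repeated-lane check, specialized to an 8 x 32-bit
// type (two 128-bit lanes of four elements). Entries >= 8 in the repeated
// mask refer to the second input, matching the convention described above.
static bool isLaneRepeated(const std::array<int, 8> &Mask,
                           std::array<int, 4> &RepeatedMask) {
  const int Size = 8, LaneSize = 4;
  RepeatedMask.fill(-1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // the element crosses a 128-bit lane
    if (RepeatedMask[i % LaneSize] == -1)
      RepeatedMask[i % LaneSize] =
          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
      return false; // the two lanes disagree
  }
  return true;
}

int main() {
  std::array<int, 4> Rep;
  // v8f32 shuffle <0,8,2,10,4,12,6,14>: both lanes apply the same
  // lane-relative pattern, so the repeated mask is {0, 8, 2, 10}
  // (entries >= 8 name elements of V2's lane).
  bool Ok = isLaneRepeated({0, 8, 2, 10, 4, 12, 6, 14}, Rep);
  std::printf("%d: %d %d %d %d\n", Ok, Rep[0], Rep[1], Rep[2], Rep[3]);
  // prints: 1: 0 8 2 10
}
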
+static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + +// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC +// 2013 will allow us to use it as a non-type template parameter. +namespace { + +/// \brief Implementation of the \c isShuffleEquivalent variadic functor. +/// +/// See its documentation for details. +bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { + if (Mask.size() != Args.size()) + return false; + for (int i = 0, e = Mask.size(); i < e; ++i) { + assert(*Args[i] >= 0 && "Arguments must be positive integers!"); + if (Mask[i] != -1 && Mask[i] != *Args[i]) + return false; + } + return true; +} + +} // namespace + +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of +/// arguments. +/// +/// This is a fast way to test a shuffle mask against a fixed pattern: +/// +/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } +/// +/// It returns true if the mask is exactly as wide as the argument list, and +/// each element of the mask is either -1 (signifying undef) or the value given +/// in the argument. +static const VariadicFunction1< + bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -6941,6 +7406,835 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, return DAG.getConstant(Imm, MVT::i8); } +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is in fact a blend. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] >= Size) { + if (Mask[i] != i + Size) + return SDValue(); // Shuffled V2 input! + BlendMask |= 1u << i; + continue; + } + if (Mask[i] >= 0 && Mask[i] != i) + return SDValue(); // Shuffled V1 input! 
+ } + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. + int Scale = 8 / VT.getVectorNumElements(); + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v32i8: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SDValue VSELECTMask[32]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask[Scale * i + j] = + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? 
-1 : 0, MVT::i8); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + +/// \brief Generic routine to lower a shuffle and blend as a decomposed set of +/// unblended shuffles followed by an unshuffled blend. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + SmallVector<int, 32> BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + +/// \brief Try to lower a vector shuffle as a byte rotation. +/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. +/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +/// +/// Note that this only handles 128-bit vector widths currently. +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + // We need to detect various ways of spelling a rotation: + // [11, 12, 13, 14, 15, 0, 1, 2] + // [-1, 12, 13, 14, -1, -1, 1, -1] + // [-1, -1, -1, -1, -1, -1, 1, 2] + // [ 3, 4, 5, 6, 7, 8, 9, 10] + // [-1, 4, 5, 6, -1, -1, 9, -1] + // [-1, 4, 5, 6, -1, -1, -1, -1] + int Rotation = 0; + SDValue Lo, Hi; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] == -1) + continue; + assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); + + // Based on the mod-Size value of this mask element determine where + // a rotated vector would have started. + int StartIdx = i - (Mask[i] % Size); + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); + + // If we found the tail of a vector the rotation must be the missing + // front. 
If we found the head of a vector, it must be how much of the head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; + + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[i] < Size ? V1 : V2; + + // Compute which of the two target values this index should be assigned to. + // This reflects whether the high elements are remaining or the low elements + // are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } + + // Check that we successfully analyzed the mask, and normalize the results. + assert(Rotation != 0 && "Failed to locate a viable rotation!"); + assert((Lo || Hi) && "Failed to find a rotated input vector!"); + if (!Lo) + Lo = Hi; + else if (!Hi) + Hi = Lo; + + assert(VT.getSizeInBits() == 128 && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + + // The actual rotate instruction rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector. + int Scale = 16 / Mask.size(); + + // SSSE3 targets can use the palignr instruction + if (Subtarget->hasSSSE3()) { + // Cast the inputs to v16i8 to match PALIGNR. + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, + DAG.getConstant(Rotation * Scale, MVT::i8))); + } + + // Default SSE2 implementation + int LoByteShift = 16 - Rotation * Scale; + int HiByteShift = Rotation * Scale; + + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); + + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + DAG.getConstant(8 * LoByteShift, MVT::i8)); + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + DAG.getConstant(8 * HiByteShift, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); +} + +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. 
+ if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node, dig out the input value and + // use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 +/// byte-shift instructions. The mask must consist of a shifted sequential +/// shuffle from one of the input vectors and zeroable elements for the +/// remaining 'shifted in' elements. +/// +/// Note that this only handles 128-bit vector widths currently. +static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Size = Mask.size(); + int Scale = 16 / Size; + + for (int Shift = 1; Shift < Size; Shift++) { + int ByteShift = Shift * Scale; + + // PSRLDQ : (little-endian) right byte shift + // [ 5, 6, 7, zz, zz, zz, zz, zz] + // [ -1, 5, 6, 7, zz, zz, zz, zz] + // [ 1, 2, -1, -1, -1, -1, zz, zz] + bool ZeroableRight = true; + for (int i = Size - Shift; i < Size; i++) { + ZeroableRight &= Zeroable[i]; + } + + if (ZeroableRight) { + bool ValidShiftRight1 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift); + bool ValidShiftRight2 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift); + + if (ValidShiftRight1 || ValidShiftRight2) { + // Cast the inputs to v2i64 to match PSRLDQ. + SDValue &TargetV = ValidShiftRight1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + + // PSLLDQ : (little-endian) left byte shift + // [ zz, 0, 1, 2, 3, 4, 5, 6] + // [ zz, zz, -1, -1, 2, 3, 4, -1] + // [ zz, zz, zz, zz, zz, zz, -1, 1] + bool ZeroableLeft = true; + for (int i = 0; i < Shift; i++) { + ZeroableLeft &= Zeroable[i]; + } + + if (ZeroableLeft) { + bool ValidShiftLeft1 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0); + bool ValidShiftLeft2 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size); + + if (ValidShiftLeft1 || ValidShiftLeft2) { + // Cast the inputs to v2i64 to match PSLLDQ. + SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + } + + return SDValue(); +} + +/// \brief Lower a vector shuffle as a zero or any extension. +/// +/// Given a specific number of elements, element bit width, and extension +/// stride, produce either a zero or any extension based on the available +/// features of the subtarget. 
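
To illustrate the byte-shift matching above: on v8i16, the shuffle <2,3,4,5,6,7,z,z> (z meaning zeroable) reads V1 sequentially starting at element 2 and shifts zeros into the top, so it can be emitted as a single PSRLDQ. A minimal sketch of just that case follows; the matchesRightByteShift name is illustrative and zeroable/undef entries are modeled as -1.

#include <array>
#include <cstdio>

// Mirrors only the "sequential read of V1, zeros shifted into the top" case
// of the matcher above.
static bool matchesRightByteShift(const std::array<int, 8> &Mask, int Shift) {
  const int Size = 8;
  for (int i = Size - Shift; i < Size; ++i)
    if (Mask[i] != -1)
      return false; // the shifted-in elements must be zeroable
  for (int i = 0; i < Size - Shift; ++i)
    if (Mask[i] != -1 && Mask[i] != i + Shift)
      return false; // the rest must read V1 sequentially from element Shift
  return true;
}

int main() {
  // <2,3,4,5,6,7,z,z> on v8i16: shift right by two 16-bit elements, which the
  // lowering emits as a 4-byte PSRLDQ.
  std::printf("%d\n", matchesRightByteShift({2, 3, 4, 5, 6, 7, -1, -1}, 2));
  // prints 1
}
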
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+    SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  assert(Scale > 1 && "Need a scale to extend.");
+  int EltBits = VT.getSizeInBits() / NumElements;
+  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+         "Only 8, 16, and 32 bit elements can be extended.");
+  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+
+  // Found a valid zext mask! Try various lowering strategies based on the
+  // input type and available ISA extensions.
+  if (Subtarget->hasSSE41()) {
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+                                 NumElements / Scale);
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+  }
+
+  // For any extends we can cheat for larger element sizes and use shuffle
+  // instructions that can fold with a load and/or copy.
+  if (AnyExt && EltBits == 32) {
+    int PSHUFDMask[4] = {0, -1, 1, -1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+  }
+  if (AnyExt && EltBits == 16 && Scale > 2) {
+    int PSHUFDMask[4] = {0, -1, 0, -1};
+    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+    int PSHUFHWMask[4] = {1, -1, -1, -1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                    DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+                    getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+  }
+
+  // If this would require more than 2 unpack instructions to expand, use
+  // pshufb when available. We can only use more than 2 unpack instructions
+  // when zero extending i8 elements which also makes it easier to use pshufb.
+  if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+    assert(NumElements == 16 && "Unexpected byte vector width!");
+    SDValue PSHUFBMask[16];
+    for (int i = 0; i < 16; ++i)
+      PSHUFBMask[i] =
+          DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
+    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                               MVT::v16i8, PSHUFBMask)));
+  }
+
+  // Otherwise emit a sequence of unpacks.
+  do {
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                         : getZeroVector(InputVT, Subtarget, DAG, DL);
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+    Scale /= 2;
+    EltBits *= 2;
+    NumElements /= 2;
+  } while (Scale > 1);
+  return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Bits = VT.getSizeInBits();
+  int NumElements = Mask.size();
+
+  // Define a helper function to check a particular ext-scale and lower to it
+  // if valid.
+  auto Lower = [&](int Scale) -> SDValue {
+    SDValue InputV;
+    bool AnyExt = true;
+    for (int i = 0; i < NumElements; ++i) {
+      if (Mask[i] == -1)
+        continue; // Valid anywhere but doesn't tell us anything.
+      if (i % Scale != 0) {
+        // Each of the extend elements needs to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+
+        // We are no longer in the anyext case.
+        AnyExt = false;
+        continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = Mask[i] < NumElements ? V1 : V2;
+      if (!InputV)
+        InputV = V;
+      else if (InputV != V)
+        return SDValue(); // Flip-flopping inputs.
+
+      if (Mask[i] % NumElements != i / Scale)
+        return SDValue(); // Non-consecutive strided elements.
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should always
+    // have already been handled.
+    // FIXME: Maybe handle this here in case during blending we end up with one?
+    if (!InputV)
+      return SDValue();
+
+    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+        DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+  };
+
+  // The widest scale possible for extending is to a 64-bit integer.
+  assert(Bits % 64 == 0 &&
+         "The number of bits in a vector must be divisible by 64 on x86!");
+  int NumExtElements = Bits / 64;
+
+  // Each iteration, try extending the elements half as much, but into twice as
+  // many elements.
+  for (; NumExtElements < NumElements; NumExtElements *= 2) {
+    assert(NumElements % NumExtElements == 0 &&
+           "The input vector size must be divisible by the extended size.");
+    if (SDValue V = Lower(NumElements / NumExtElements))
+      return V;
+  }
+
+  // No viable ext lowering found.
+  return SDValue();
+}
+
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+  // If the bitcasts shift the element size, we can't extract an equivalent
+  // element from it.
+  MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() ||
+      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
+    return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
+
+  return SDValue();
+}
+
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
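Concretely, the test defined below just peels off bitcasts and asks whether what remains is a plain, non-extending vector load, since only that form can be folded into a shuffle's memory operand. A rough sketch of the same idea over a toy node type (hypothetical types and names, not the SelectionDAG API):

// Toy illustration of the bitcast-peeling idiom; the real code below walks
// SDValue operands rather than this struct.
struct ToyNode {
  enum Kind { Bitcast, PlainLoad, ExtendingLoad, Other } K;
  const ToyNode *Operand = nullptr;
};

static bool isFoldableLoadSketch(const ToyNode *N) {
  while (N && N->K == ToyNode::Bitcast)
    N = N->Operand;                       // Look through any number of bitcasts.
  return N && N->K == ToyNode::PlainLoad; // Extending loads do not qualify.
}

Later in this change, lowerV2F64VectorShuffle uses exactly this test to choose between MOVLPD (load over the low double) and MOVSD (move the low double between registers).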
+static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + +/// \brief Try to lower insertion of a single element into a zero vector. +/// +/// This is a common pattern that we have especially efficient patterns to lower +/// across all subtarget feature sets. +static SDValue lowerVectorShuffleAsElementInsertion( + MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); + + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; + } + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + if (SDValue V2S = getScalarValueForVectorElement( + V2, Mask[V2Index] - Mask.size(), DAG)) { + // We need to zext the scalar if it is smaller than an i32. + V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. + return SDValue(); + } + + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); + + // Otherwise, use MOVSD or MOVSS. + assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); + } + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); + if (ExtVT != VT) + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. 
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { + SmallVector<int, 4> V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant( + V2Index * EltVT.getSizeInBits(), + DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + } + } + return V2; +} + +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. + int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to try and find a scalar load that + // we can combine with the broadcast. + for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load we can't broadcast from it in AVX1, only with + // AVX2. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register w/o AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + +// Check for whether we can use INSERTPS to perform the shuffle. We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. 
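The INSERTPS immediate assembled by the routine below packs three fields: bits [7:6] select the source lane of the inserted operand, bits [5:4] select the destination lane, and the low four bits zero individual destination lanes. A small standalone sketch of that encoding (hypothetical helper name, illustrative only):

#include <cassert>

// Pack an INSERTPS immediate the same way the lowering below does:
// source lane in bits [7:6], destination lane in bits [5:4], zero mask in
// bits [3:0].
static unsigned makeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                                unsigned ZeroLaneMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroLaneMask < 16 && "field out of range");
  return (SrcLane << 6) | (DstLane << 4) | ZeroLaneMask;
}

// Example: insert lane 2 of V2 into lane 1 of the result and zero lane 3:
// makeInsertPSImm(2, 1, 0x8) == 0x98.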
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; i++) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. + V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -6963,12 +8257,56 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + + // If we have a single input, insert that into V1 if we can do so cheaply. 
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); @@ -6992,6 +8330,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. @@ -7005,6 +8348,44 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + + // If we have a single input from V2 insert that into V1 if we can do so + // cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
+ if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't @@ -7015,38 +8396,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// \brief Lower 4-lane 32-bit floating point shuffles. +/// \brief Lower a vector shuffle using the SHUFPS instruction. /// -/// Uses instructions exclusively from the floating point unit to minimize -/// domain crossing penalties, as these are sufficient to implement all v4f32 -/// shuffles. -static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 0) - // Straight shuffle of a single input vector. We pass the input vector to - // both operands to simulate this with a SHUFPS. - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); - if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; @@ -7063,7 +8431,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; - V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary @@ -7080,9 +8448,17 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the - // high lanes. We never see this reversed because we sort the shuffle. + // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. 
+ NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final @@ -7094,7 +8470,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; - V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to @@ -7106,10 +8482,78 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, NewMask[3] = Mask[2] < 4 ? 3 : 1; } } - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) { + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use INSERTPS if we can complete the shuffle efficiently. 
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; + } + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for @@ -7125,11 +8569,66 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (isSingleInputShuffleMask(Mask)) + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. + // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + Mask = UnpackHiMask; + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build @@ -7182,6 +8681,27 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, + Mask, Subtarget, DAG)) + return Broadcast; + + // Try to use byte shift instructions. 
+ if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V, V, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); + if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) + return Rotate; + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through @@ -7190,22 +8710,126 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // - // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 - // and 2-2. - auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput, - int ThreeInputHalfSum, int OneInputHalfOffset) { + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half + // and an existing 2-into-2 on the other half. In this case we may have to + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. + // Fortunately, we don't have to handle anything but a 2-into-2 pattern + // because any other situation (including a 3-into-1 or 1-into-3 in the other + // half than the one we target for fixing) will be fixed when we re-enter this + // path. We will also combine away any sequence of PSHUFD instructions that + // result into a single instruction. Here is an example of the tricky case: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] + // + // This now has a 1-into-3 in the high half! Instead, we do two shuffles: + // + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] + // + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] + // + // The result is fine to be handled by the generic logic. + auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, + int AOffset, int BOffset) { + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && + "Must call this with A having 3 or 1 inputs from the A half."); + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && + "Must call this with B having 1 or 3 inputs from the B half."); + assert(AToAInputs.size() + BToAInputs.size() == 4 && + "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. 
- int DWordA = (ThreeInputHalfSum - - std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / - 2; - int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + int ADWord, BDWord; + int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; + int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; + int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; + ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; + int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); + int TripleNonInputIdx = + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleDWord = TripleNonInputIdx / 2; + + // We use xor with one to compute the adjacent DWord to whichever one the + // OneInput is in. + OneInputDWord = (OneInput / 2) ^ 1; + + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA + // and BToA inputs. If there is also such a problem with the BToB and AToB + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in + // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it + // is essential that we don't *create* a 3<-1 as then we might oscillate. + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { + // Compute how many inputs will be flipped by swapping these DWords. We + // need + // to balance this to ensure we don't form a 3-1 shuffle in the other + // half. + int NumFlippedAToBInputs = + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); + int NumFlippedBToBInputs = + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + if ((NumFlippedAToBInputs == 1 && + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || + (NumFlippedBToBInputs == 1 && + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { + // We choose whether to fix the A half or B half based on whether that + // half has zero flipped inputs. At zero, we may not be able to fix it + // with that half. We also bias towards fixing the B half because that + // will more commonly be the high half, and we have to bias one way. + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, + ArrayRef<int> Inputs) { + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), + PinnedIdx ^ 1) != Inputs.end(); + // Determine whether the free index is in the flipped dword or the + // unflipped dword based on where the pinned index is. We use this bit + // in an xor to conditionally select the adjacent dword. + int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + if (IsFixIdxInput == IsFixFreeIdxInput) + FixFreeIdx += 1; + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + assert(IsFixIdxInput != IsFixFreeIdxInput && + "We need to be changing the number of flipped inputs!"); + int PSHUFHalfMask[] = {0, 1, 2, 3}; + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); + V = DAG.getNode(FixIdx < 4 ? 
X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); + + for (int &M : Mask) + if (M != -1 && M == FixIdx) + M = FixFreeIdx; + else if (M != -1 && M == FixFreeIdx) + M = FixIdx; + }; + if (NumFlippedBToBInputs != 0) { + int BPinnedIdx = + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); + } else { + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); + int APinnedIdx = + AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); + } + } + } int PSHUFDMask[] = {0, 1, 2, 3}; - PSHUFDMask[DWordA] = DWordB; - PSHUFDMask[DWordB] = DWordA; + PSHUFDMask[ADWord] = BDWord; + PSHUFDMask[BDWord] = ADWord; V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), @@ -7213,24 +8837,20 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Adjust the mask to match the new locations of A and B. for (int &M : Mask) - if (M != -1 && M/2 == DWordA) - M = 2 * DWordB + M % 2; - else if (M != -1 && M/2 == DWordB) - M = 2 * DWordA + M % 2; + if (M != -1 && M/2 == ADWord) + M = 2 * BDWord + M % 2; + else if (M != -1 && M/2 == BDWord) + M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), Mask); }; - if (NumLToL == 3 && NumHToL == 1) - return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); - else if (NumLToL == 1 && NumHToL == 3) - return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 1 && NumHToH == 3) - return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 3 && NumHToH == 1) - return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and @@ -7244,9 +8864,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. - auto fixInPlaceInputs = [&PSHUFDMask]( - ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask, - MutableArrayRef<int> HalfMask, int HalfOffset) { + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, + MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { @@ -7255,6 +8876,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. 
+ for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = @@ -7266,10 +8895,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; - if (!HToLInputs.empty()) - fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); - if (!LToHInputs.empty()) - fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. @@ -7278,7 +8905,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, - int SourceOffset, int DestOffset) { + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, + int DestOffset) { auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; @@ -7304,7 +8932,7 @@ static SDValue lowerV8I16SingleInputVectorShuffle( Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) - if (M == SourceHalfMask[Input - SourceOffset]) + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; @@ -7356,18 +8984,68 @@ static SDValue lowerV8I16SingleInputVectorShuffle( } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { - int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; - assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && - "Not all dwords can be clobbered!"); - SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; - SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. Move both inputs to the free + // slot. 
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. + for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1) + SourceOffset) + M = InputsFixed[1] + SourceOffset; + else if (M == InputsFixed[1] + SourceOffset) + M = (InputsFixed[0] ^ 1) + SourceOffset; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) - M = SourceDWordBase + SourceOffset; + M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) - M = SourceDWordBase + 1 + SourceOffset; - IncomingInputs[0] = SourceDWordBase + SourceOffset; - IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); @@ -7377,13 +9055,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; - for (int Input : IncomingInputs) - std::replace(HalfMask.begin(), HalfMask.end(), Input, - FreeDWord * 2 + Input % 2); + for (int &M : HalfMask) + for (int Input : IncomingInputs) + if (M == Input) + M = FreeDWord * 2 + Input % 2; }; - moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); - moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their @@ -7520,34 +9199,37 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, if (GoodInputs.size() == 2) { // If the low inputs are spread across two dwords, pack them into // a single dword. - MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = - Mask[GoodInputs[0]] - MaskOffset; - MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = - Mask[GoodInputs[1]] - MaskOffset; - Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; - Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; + MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; + Mask[GoodInputs[0]] = MoveOffset + MaskOffset; + Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; } else { - // Otherwise pin the low inputs. + // Otherwise pin the good inputs. 
for (int GoodInput : GoodInputs) MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; } - int MoveMaskIdx = - std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - - std::begin(MoveMask); - assert(MoveMaskIdx >= MoveOffset && "Established above"); - if (BadInputs.size() == 2) { + // If we have two bad inputs then there may be either one or two good + // inputs fixed in place. Find a fixed input, and then find the *other* + // two adjacent indices by using modular arithmetic. + int GoodMaskIdx = + std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), + [](int M) { return M >= 0; }) - + std::begin(MoveMask); + int MoveMaskIdx = + ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); - MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = - Mask[BadInputs[0]] - MaskOffset; - MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = - Mask[BadInputs[1]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; - Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; } else { assert(BadInputs.size() == 1 && "All sizes handled"); + int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, + std::end(MoveMask), -1) - + std::begin(MoveMask); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; } @@ -7603,6 +9285,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + auto isV1 = [](int M) { return M >= 0 && M < 8; }; auto isV2 = [](int M) { return M >= 8; }; @@ -7615,6 +9303,33 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); + if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. 
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); @@ -7658,6 +9373,74 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); } +/// \brief Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { + // Figure out whether we're looping over two inputs or just one. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] == -1) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. + if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -7675,6 +9458,22 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> OrigMask = SVOp->getMask(); assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + return Shift; + + // Try to use byte rotation instructions. 
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return Rotate; + + // Try to use a zext lowering. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + int MaskStorage[16] = { OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], @@ -7684,8 +9483,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MutableArrayRef<int> LoMask = Mask.slice(0, 8); MutableArrayRef<int> HiMask = Mask.slice(8, 8); + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + // For single-input shuffles, there are some nicer lowering tricks we can use. - if (isSingleInputShuffleMask(Mask)) { + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies @@ -7695,10 +9502,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef<int> Mask) { - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != Mask[i + 1]) + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; - } + return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { @@ -7759,11 +9566,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != -1) - PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); - assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); - } + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, @@ -7780,21 +9592,127 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // // FIXME: We need to handle other interleaving widths (i16, i32, ...). if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. - - int EMask[16], OMask[16]; + int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 0 && M < 8) || (M >= 16 && M < 24); + }); + int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 8 && M < 16) || M >= 24; + }); + int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + bool UnpackLo = NumLoHalf >= NumHiHalf; + MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8); + MutableArrayRef<int> TargetOMask(UnpackLo ? 
OMask : OMask + 8, 8); for (int i = 0; i < 8; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 8] = -1; - OMask[i + 8] = -1; + TargetEMask[i] = Mask[2 * i]; + TargetOMask[i] = Mask[2 * i + 1]; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, Evens, Odds); + } + + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. + // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. + if (Subtarget->hasSSSE3()) { + SDValue V1Mask[16]; + SDValue V2Mask[16]; + bool V1InUse = false; + bool V2InUse = false; + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + const int ZeroMask = 0x80; + int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask); + int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16); + if (Zeroable[i]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + if (V1InUse && V2InUse) + return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + if (V1InUse) + return V1; // Single inputs are easy. + if (V2InUse) + return V2; // Single inputs are easy. + // Shuffling to a zeroable vector. + return getZeroVector(MVT::v16i8, Subtarget, DAG, DL); + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Check whether a compaction lowering can be done. This handles shuffles + // which take every Nth element for some even N. See the helper function for + // details. + // + // We special case these as they can be particularly efficiently handled with + // the PACKUSB instruction on x86 and they show up in common patterns of + // rearranging bytes to truncate wide elements. + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + // NumEvenDrops is the power of two stride of the elements. Another way of + // thinking about it is that we need to drop the even elements this many + // times to get the original input. 
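// Illustrative, standalone sketch (plain C++, not LLVM code): the helper
// canLowerByDroppingEvenElements above accepts masks that take every
// (2^N)-th element of the single- or double-width input; the code following
// this comment then zeroes the dropped bytes and applies PACKUS N times.
// findEvenDropCount and its parameters are names invented for the sketch.
#include <cstdint>
#include <vector>

// Returns the smallest N in [1, 3] such that Mask[i] == (i * 2^N) mod Modulus
// for every defined lane, or 0 if no such N exists. -1 marks an undef lane.
static int findEvenDropCount(const std::vector<int> &Mask, bool TwoInputs) {
  uint64_t Modulus = Mask.size() * (TwoInputs ? 2 : 1);
  bool Viable[3] = {true, true, true};
  for (uint64_t i = 0; i < Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes are compatible with every stride.
    bool AnyViable = false;
    for (int N = 1; N <= 3; ++N) {
      if (!Viable[N - 1])
        continue;
      if ((uint64_t)Mask[i] == ((i << N) & (Modulus - 1)))
        AnyViable = true;
      else
        Viable[N - 1] = false;
    }
    if (!AnyViable)
      return 0;
  }
  for (int N = 1; N <= 3; ++N)
    if (Viable[N - 1])
      return N;
  return 0;
}
// e.g. {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} with TwoInputs=false yields 1.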
+ bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // First we need to zero all the dropped bytes. + assert(NumEvenDrops <= 3 && + "No support for dropping even elements more than 3 times."); + // We use the mask type to pick which bytes are preserved based on how many + // elements are dropped. + MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; + SDValue ByteClearMask = + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, + DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + if (!IsSingleInput) + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + + // Now pack things back together. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + for (int i = 1; i < NumEvenDrops; ++i) { + Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); + } + + return Result; } int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; @@ -7893,15 +9811,1109 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -/// \brief Tiny helper function to test whether adjacent masks are sequential. -static bool areAdjacentMasksSequential(ArrayRef<int> Mask) { - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - if (Mask[i] + 1 != Mask[i+1]) +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. 
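// Illustrative, standalone sketch (plain C++ over int vectors, not LLVM code)
// of the widening rule this loop enforces for ordinary masks: each pair of
// narrow lanes must collapse to a single wide lane, with -1 acting as a
// wildcard. The real helper additionally understands a "forced zero"
// sentinel, which is not modeled here; widenMask is an invented name.
#include <vector>

static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1);        // Both halves undef.
    else if (Lo < 0 && Hi >= 0 && Hi % 2 == 1)
      Widened.push_back(Hi / 2);    // Undef low half, aligned high half.
    else if (Hi < 0 && Lo >= 0 && Lo % 2 == 0)
      Widened.push_back(Lo / 2);    // Aligned low half, undef high half.
    else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
      Widened.push_back(Lo / 2);    // An adjacent, properly aligned pair.
    else
      return false;                 // This pair cannot be widened.
  }
  return true;
}
// e.g. {0,1,6,7} widens to {0,3}; {1,2,4,5} cannot be widened.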
+ return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); return true; } +/// \brief Generic routine to split ector shuffle into half-sized shuffles. +/// +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. +static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); + assert(V1.getSimpleValueType() == VT && "Bad operand type!"); + assert(V2.getSimpleValueType() == VT && "Bad operand type!"); + + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); + + int NumElements = VT.getVectorNumElements(); + int SplitNumElements = NumElements / 2; + MVT ScalarVT = VT.getScalarType(); + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + + SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(SplitNumElements)); + SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(0)); + SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(SplitNumElements)); + + // Now create two 4-way blends of these half-width vectors. + auto HalfBlend = [&](ArrayRef<int> HalfMask) { + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + for (int i = 0; i < SplitNumElements; ++i) { + int M = HalfMask[i]; + if (M >= NumElements) { + if (M >= NumElements + SplitNumElements) + UseHiV2 = true; + else + UseLoV2 = true; + V2BlendMask.push_back(M - NumElements); + V1BlendMask.push_back(-1); + BlendMask.push_back(SplitNumElements + i); + } else if (M >= 0) { + if (M >= SplitNumElements) + UseHiV1 = true; + else + UseLoV1 = true; + V2BlendMask.push_back(-1); + V1BlendMask.push_back(M); + BlendMask.push_back(i); + } else { + V2BlendMask.push_back(-1); + V1BlendMask.push_back(-1); + BlendMask.push_back(-1); + } + } + + // Because the lowering happens after all combining takes place, we need to + // manually combine these blend masks as much as possible so that we create + // a minimal number of high-level vector shuffle nodes. + + // First try just blending the halves of V1 or V2. + if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) + return DAG.getUNDEF(SplitVT); + if (!UseLoV2 && !UseHiV2) + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + if (!UseLoV1 && !UseHiV1) + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + + SDValue V1Blend, V2Blend; + if (UseLoV1 && UseHiV1) { + V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + } else { + // We only use half of V1 so map the usage down into the final blend mask. + V1Blend = UseLoV1 ? LoV1 : HiV1; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); + } + if (UseLoV2 && UseHiV2) { + V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + } else { + // We only use half of V2 so map the usage down into the final blend mask. + V2Blend = UseLoV2 ? 
LoV2 : HiV2; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= SplitNumElements) + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); + } + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); + }; + SDValue Lo = HalfBlend(LoMask); + SDValue Hi = HalfBlend(HiMask); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); +} + +/// \brief Either split a vector in halves or decompose the shuffles and the +/// blend. +/// +/// This is provided as a good fallback for many lowerings of non-single-input +/// shuffles with more than one 128-bit lane. In those cases, we want to select +/// between splitting the shuffle into 128-bit components and stitching those +/// back together vs. extracting the single-input shuffles and blending those +/// results. +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " + "lower single-input shuffles as it " + "could then recurse on itself."); + int Size = Mask.size(); + + // If this can be modeled as a broadcast of two elements followed by a blend, + // prefer that lowering. This is especially important because broadcasts can + // often fold with memory operands. + auto DoBothBroadcast = [&] { + int V1BroadcastIdx = -1, V2BroadcastIdx = -1; + for (int M : Mask) + if (M >= Size) { + if (V2BroadcastIdx == -1) + V2BroadcastIdx = M - Size; + else if (M - Size != V2BroadcastIdx) + return false; + } else if (M >= 0) { + if (V1BroadcastIdx == -1) + V1BroadcastIdx = M; + else if (M != V1BroadcastIdx) + return false; + } + return true; + }; + if (DoBothBroadcast()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + DAG); + + // If the inputs all stem from a single 128-bit lane of each input, then we + // split them rather than blending because the split will decompose to + // unusually few instructions. + int LaneCount = VT.getSizeInBits() / 128; + int LaneSize = Size / LaneCount; + SmallBitVector LaneInputs[2]; + LaneInputs[0].resize(LaneCount, false); + LaneInputs[1].resize(LaneCount, false); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + // Otherwise, just fall back to decomposed shuffles and a blend. This requires + // that the decomposed single-input shuffles don't end up here. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. 
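// Illustrative, standalone sketch (plain C++, not LLVM code) of the
// classification step inside the HalfBlend lambda of splitAndLowerVectorShuffle
// above: for one half of a wide two-input shuffle, derive a mask for each
// input's halves plus a final blend mask that stitches the two half-width
// results back together. HalfPlan and planHalf are names invented for the
// sketch; N is the full (unsplit) element count, -1 marks an undef lane.
#include <vector>

struct HalfPlan {
  std::vector<int> V1Mask, V2Mask, Blend;
};

static HalfPlan planHalf(const std::vector<int> &HalfMask, int N) {
  int Split = N / 2;
  HalfPlan P;
  for (int i = 0, e = (int)HalfMask.size(); i < e; ++i) {
    int M = HalfMask[i];
    if (M >= N) {                    // Element comes from V2.
      P.V2Mask.push_back(M - N);
      P.V1Mask.push_back(-1);
      P.Blend.push_back(Split + i);
    } else if (M >= 0) {             // Element comes from V1.
      P.V1Mask.push_back(M);
      P.V2Mask.push_back(-1);
      P.Blend.push_back(i);
    } else {                         // Undef lane.
      P.V1Mask.push_back(-1);
      P.V2Mask.push_back(-1);
      P.Blend.push_back(-1);
    }
  }
  return P;
}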
+ assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. The flags track wether the given lane contains an element + // that crosses to another lane. + bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector<int, 32> FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // Blends are faster and handle all the non-lane-crossing cases. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || + isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, + DAG.getIntPtrConstant(2)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + + // Otherwise form a 128-bit permutation. + // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, MVT::i8)); +} + +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then +/// shuffling each lane. 
+/// +/// This will only succeed when the result of fixing the 128-bit lanes results +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in +/// each 128-bit lanes. This handles many cases where we can quickly blend away +/// the lane crosses early and then use simpler shuffles within each lane. +/// +/// FIXME: It might be worthwhile at some point to support this without +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently +/// in x86 only floating point has interesting non-repeating shuffles, and even +/// those are still *marginally* more expensive. +static SDValue lowerVectorShuffleByMerging128BitLanes( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This is only useful with multiple inputs."); + + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + int NumLanes = Size / LaneSize; + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); + + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also + // check whether the in-128-bit lane shuffles share a repeating pattern. + SmallVector<int, 4> Lanes; + Lanes.resize(NumLanes, -1); + SmallVector<int, 4> InLaneMask; + InLaneMask.resize(LaneSize, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int j = i / LaneSize; + + if (Lanes[j] < 0) { + // First entry we've seen for this lane. + Lanes[j] = Mask[i] / LaneSize; + } else if (Lanes[j] != Mask[i] / LaneSize) { + // This doesn't match the lane selected previously! + return SDValue(); + } + + // Check that within each lane we have a consistent shuffle mask. + int k = i % LaneSize; + if (InLaneMask[k] < 0) { + InLaneMask[k] = Mask[i] % LaneSize; + } else if (InLaneMask[k] != Mask[i] % LaneSize) { + // This doesn't fit a repeating in-lane mask. + return SDValue(); + } + } + + // First shuffle the lanes into place. + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, + VT.getSizeInBits() / 64); + SmallVector<int, 8> LaneMask; + LaneMask.resize(NumLanes * 2, -1); + for (int i = 0; i < NumLanes; ++i) + if (Lanes[i] >= 0) { + LaneMask[2 * i + 0] = 2*Lanes[i] + 0; + LaneMask[2 * i + 1] = 2*Lanes[i] + 1; + } + + V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); + + // Cast it back to the type we actually want. + LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); + + // Now do a simple shuffle that isn't lane crossing. + SmallVector<int, 8> NewMask; + NewMask.resize(Size, -1); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && + "Must not introduce lane crosses at this point!"); + + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); +} + +/// \brief Test whether the specified input (0 or 1) is in-place blended by the +/// given mask. +/// +/// This returns true if the elements from a particular input are already in the +/// slot required by the given mask and require no permutation. 
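// Illustrative, standalone sketch (plain C++, not LLVM code) of the two
// conditions lowerVectorShuffleByMerging128BitLanes above checks before it
// fires: every result lane must read from exactly one source lane, and the
// within-lane pattern must repeat across all lanes. splitLaneMerge is a name
// invented for the sketch; entries >= Mask.size() select the second input and
// -1 marks an undef lane.
#include <vector>

static bool splitLaneMerge(const std::vector<int> &Mask, int LaneSize,
                           std::vector<int> &Lanes, std::vector<int> &InLane) {
  int Size = (int)Mask.size();
  Lanes.assign(Size / LaneSize, -1);
  InLane.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int ResultLane = i / LaneSize;
    int SourceLane = Mask[i] / LaneSize;
    if (Lanes[ResultLane] < 0)
      Lanes[ResultLane] = SourceLane;
    else if (Lanes[ResultLane] != SourceLane)
      return false;                 // Mixed source lanes in one result lane.
    int Slot = i % LaneSize;
    int SourceSlot = Mask[i] % LaneSize;
    if (InLane[Slot] < 0)
      InLane[Slot] = SourceSlot;
    else if (InLane[Slot] != SourceSlot)
      return false;                 // The in-lane pattern does not repeat.
  }
  return true;
}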
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) + return false; + + return true; +} + +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles. +/// +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. 
+ if ((Mask[0] == -1 || Mask[0] < 2) && + (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && + (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && + (Mask[3] == -1 || Mask[3] >= 6)) { + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && + (Mask[1] == -1 || Mask[1] < 2) && + (Mask[2] == -1 || Mask[2] >= 6) && + (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because an v4 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. 
+ SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + + // Otherwise, fall back to a SHUFPS sequence. 
Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v8i32, VPermMask)), + V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because at v8 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. +static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. 
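// Illustrative, standalone sketch (plain C++, not LLVM code) of the mask
// equivalence test behind the unpack checks below: a defined lane must match
// the reference pattern exactly, while -1 (undef) matches anything. The real
// helper takes the expected entries as individual arguments; masksEquivalent
// and its vector parameters are invented for the sketch.
#include <vector>

static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Ref) {
  if (Mask.size() != Ref.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Ref[i])
      return false;   // A defined lane must match; undef lanes are wildcards.
  return true;
}
// e.g. {0,-1,1,9,4,12,-1,13} is equivalent to the v8i32 UNPCKL pattern
// {0,8,1,9,4,12,5,13}, but {0,8,2,9,...} is not.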
+ if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + } + + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? 
Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i16, + DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + // Note that these are repeated 128-bit lane unpacks, not unpacks across all + // 256-bit lanes. + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); + + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 256-bit x86 vector shuffles. 
+/// +/// This routine either breaks down the specific type of a 256-bit x86 vector +/// shuffle or splits it into two 128-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can + // check for those subtargets here and avoid much of the subtarget querying in + // the per-vector-type lowering routines. With AVX1 we have essentially *zero* + // ability to manipulate a 256-bit vector with integer types. Since we'll use + // floating point types there eventually, just immediately cast everything to + // a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + + switch (VT.SimpleTy) { + case MVT::v4f64: + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i64: + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8f32: + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i8: + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Not a valid 256-bit x86 vector type!"); + } +} + +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 
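// Illustrative, standalone sketch (plain C++, not LLVM code) of the decision
// made by the 256-bit dispatch above: without AVX2 there is essentially no
// 256-bit integer shuffle support, so i32/i64 element vectors are bitcast to
// the f32/f64 domain, while narrower integer elements are split into two
// 128-bit shuffles. The enum and function names are invented for the sketch.
#include <cassert>

enum class Avx256Strategy { SplitInto128, CastToFloatDomain, UseNativeIsa };

static Avx256Strategy pick256BitStrategy(bool IsInteger, int ElementBits,
                                         bool HasAVX2) {
  assert(ElementBits == 8 || ElementBits == 16 || ElementBits == 32 ||
         ElementBits == 64);
  if (!IsInteger || HasAVX2)
    return Avx256Strategy::UseNativeIsa;    // FP types or full AVX2 support.
  return ElementBits < 32 ? Avx256Strategy::SplitInto128
                          : Avx256Strategy::CastToFloatDomain;
}
// e.g. a v16i16 shuffle on AVX1 splits; a v8i32 shuffle is handled as v8f32.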
+static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + 0, 16, 1, 17, 4, 20, 5, 21, + 8, 24, 9, 25, 12, 28, 13, 29)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); + if (isShuffleEquivalent(Mask, + 2, 18, 3, 19, 6, 22, 7, 23, + 10, 26, 11, 27, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + 0, 16, 1, 17, 4, 20, 5, 21, + 8, 24, 9, 25, 12, 28, 13, 29)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); + if (isShuffleEquivalent(Mask, + 2, 18, 3, 19, 6, 22, 7, 23, + 10, 26, 11, 27, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. 
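// Illustrative, standalone sketch (plain C++, not LLVM code) generating the
// reference unpack masks that the 256- and 512-bit checks above compare
// against: UNPCKL/UNPCKH interleave the two sources within each 128-bit lane,
// which is why the literal masks restart every LaneElts entries. unpackMask is
// a name invented for the sketch.
#include <vector>

static std::vector<int> unpackMask(int NumElts, int LaneElts, bool Hi) {
  std::vector<int> Mask;
  int HalfLane = LaneElts / 2;
  for (int Lane = 0; Lane < NumElts; Lane += LaneElts)
    for (int i = 0; i < HalfLane; ++i) {
      int Base = Lane + i + (Hi ? HalfLane : 0);
      Mask.push_back(Base);            // Element from the first source.
      Mask.push_back(Base + NumElts);  // Matching element from the second.
    }
  return Mask;
}
// unpackMask(16, 4, false) == {0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29},
// i.e. the v16i32/v16f32 UNPCKL pattern checked above; Hi=true gives UNPCKH.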
+static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 64-lane 8-bit integer shuffles. +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 512-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 512-bit x86 vector +/// shuffle or splits it into two 256-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/ basic ISA!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Dispatch to each element type for lowering. If we don't have supprot for + // specific element type shuffles at 512 bits, immediately split them and + // lower them. Each lowering routine of a given type is allowed to assume that + // the requisite ISA extensions for that element type are available. + switch (VT.SimpleTy) { + case MVT::v8f64: + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i32: + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i16: + if (Subtarget->hasBWI()) + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + case MVT::v64i8: + if (Subtarget->hasBWI()) + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + + default: + llvm_unreachable("Not a valid 512-bit x86 vector type!"); + } + + // Otherwise fall back on splitting. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); +} + /// \brief Top-level lowering for x86 vector shuffles. 
/// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -7945,22 +10957,25 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } - // For integer vector shuffles, try to collapse them into a shuffle of fewer - // lanes but wider integers. We cap this to not form integers larger than i64 - // but it might be interesting to form i128 integers to handle flipping the - // low and high halves of AVX 256-bit vectors. - if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && - areAdjacentMasksSequential(Mask)) { - SmallVector<int, 8> NewMask; - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - NewMask.push_back(Mask[i] / 2); - MVT NewVT = - MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), - VT.getVectorNumElements() / 2); - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector<int, 16> WidenedMask; + if (VT.getScalarSizeInBits() < 64 && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. + if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; @@ -7979,7 +10994,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the - // number of uses of V2 in the low half of the vector. + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. 
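// Illustrative, standalone sketch (plain C++, not LLVM code) of the commute
// canonicalization described above, which the caller only applies once V1 and
// V2 are used the same number of times: prefer V1 in the low half, then break
// ties by the sum of positions using each input, then by how many odd
// positions each input occupies. shouldCommuteOperands is an invented name;
// entries >= NumElts select V2, entries in [0, NumElts) select V1, -1 is undef.
#include <vector>

static bool shouldCommuteOperands(const std::vector<int> &Mask, int NumElts) {
  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i < NumElts / 2; ++i) {
    if (Mask[i] >= NumElts)
      ++LowV2;
    else if (Mask[i] >= 0)
      ++LowV1;
  }
  if (LowV2 != LowV1)
    return LowV2 > LowV1;             // More V2 uses in the low half: commute.
  int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] >= NumElts) {
      SumV2 += i;
      OddV2 += i % 2;
    } else if (Mask[i] >= 0) {
      SumV1 += i;
      OddV1 += i % 2;
    }
  }
  if (SumV2 != SumV1)
    return SumV2 < SumV1;             // V2 sits in earlier slots: commute.
  return OddV2 < OddV1;               // Fewer odd slots for V2: commute.
}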
if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) @@ -7987,14 +11005,42 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, ++LowV2Elements; else if (M >= 0) ++LowV1Elements; - if (LowV2Elements > LowV1Elements) + if (LowV2Elements > LowV1Elements) { return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (SVOp->getMask()[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + return DAG.getCommutedVectorShuffle(*SVOp); + } + } } // For each vector width, delegate to a specialized lowering routine. if (VT.getSizeInBits() == 128) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (VT.getSizeInBits() == 256) + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + // Force AVX-512 vectors to be scalarized for now. + // FIXME: Implement AVX-512 support! + if (VT.getSizeInBits() == 512) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); } @@ -9060,7 +12106,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, // should assume we're changing V2's element's place and behave // accordingly. int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); - if (FromV1 == FromV2 && DestIndex == Mask[DestIndex] % 4) { + assert(DestIndex <= INT32_MAX && "truncated destination index"); + if (FromV1 == FromV2 && + static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { From = V2; To = V1; DestIndex = @@ -9163,37 +12211,6 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); - // Simplify the operand as it's prepared to be fed into shuffle. - unsigned SignificantBits = NVT.getSizeInBits() >> Shift; - if (V1.getOpcode() == ISD::BITCAST && - V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V1.getOperand(0).getOperand(0) - .getSimpleValueType().getSizeInBits() == SignificantBits) { - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); - ConstantSDNode *CIdx = - dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); - // If it's foldable, i.e. normal load with single use, we will let code - // selection to fold it. Otherwise, we will short the conversion sequence. - if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { - MVT FullVT = V.getSimpleValueType(); - MVT V1VT = V1.getSimpleValueType(); - if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { - // The "ext_vec_elt" node is wider than the result node. - // In this case we should extract subvector from V. - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). 
- unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); - MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), - FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, - DAG.getIntPtrConstant(0)); - } - V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); - } - } - return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } @@ -9343,7 +12360,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, @@ -9355,6 +12372,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePALIGNRImmediate(SVOp), DAG); + if (isVALIGNMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, + getShuffleVALIGNImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; @@ -9520,7 +12542,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); } @@ -9639,9 +12661,10 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -// Try to lower a vselect node into a simple blend instruction. -static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend +/// instruction. +static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); @@ -9683,7 +12706,14 @@ static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
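// Illustrative, standalone sketch (plain C++, not LLVM code) of why the
// all-constant early-out added below is profitable: once the condition and
// both data operands are constant build_vectors, the whole vselect folds to a
// single constant vector, which the generic legalizer can then materialize as
// one load. foldConstantVSelect is a name invented for the sketch; Cond uses
// the usual "true selects the first data operand" convention.
#include <cstdint>
#include <vector>

static std::vector<int32_t> foldConstantVSelect(const std::vector<bool> &Cond,
                                                const std::vector<int32_t> &LHS,
                                                const std::vector<int32_t> &RHS) {
  std::vector<int32_t> Result(Cond.size());
  for (size_t i = 0; i < Cond.size(); ++i)
    Result[i] = Cond[i] ? LHS[i] : RHS[i];  // Pure compile-time selection.
  return Result;
}
// e.g. Cond {1,0,1,0}, LHS {1,2,3,4}, RHS {5,6,7,8} folds to {1,6,3,8}.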
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) + return SDValue(); + + SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -9696,6 +12726,8 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { break; case MVT::v8i16: case MVT::v16i16: + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + break; return SDValue(); } @@ -9914,62 +12946,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - SDLoc dl(Op); - - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDValue N2 = Op.getOperand(2); - - if (!VT.is128BitVector()) - return SDValue(); - - if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && - isa<ConstantSDNode>(N2)) { - unsigned Opc; - if (VT == MVT::v8i16) - Opc = X86ISD::PINSRW; - else if (VT == MVT::v16i8) - Opc = X86ISD::PINSRB; - else - Opc = X86ISD::PINSRB; - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } - - if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into these - // bits. For example (insert (extract, 3), 2) could be matched by putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may - // combine either bitwise AND or insert of float 0.0 to set these bits. - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); - // Create this as a scalar to vector.. - N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } - - if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { - // PINSR* works with constant index. - return Op; - } - return SDValue(); -} - /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. -SDValue +SDValue X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); @@ -9982,7 +12961,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { // insert element and then truncate the result. MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); - SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); @@ -10001,11 +12980,12 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(MaxSift - IdxVal, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + +SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - + if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); @@ -10013,20 +12993,20 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + auto *N2C = cast<ConstantSDNode>(N2); + unsigned IdxVal = N2C->getZExtValue(); - // If this is a 256-bit vector result, first extract the 128-bit vector, - // insert the element into the extracted half and then place it back. + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert + // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - if (!isa<ConstantSDNode>(N2)) - return SDValue(); - // Get the desired 128-bit vector half. - unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); @@ -10034,20 +13014,60 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } + assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget->hasSSE41()) - return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + if (Subtarget->hasSSE41()) { + if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { + unsigned Opc; + if (VT == MVT::v8i16) { + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8); + Opc = X86ISD::PINSRB; + } + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these + // bits. For example (insert (extract, 3), 2) could be matched by + // putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. 
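// Editor's aside, not part of this patch: a concrete reading of the INSERTPS
// immediate described in this comment, assuming the usual imm8 layout of
// source lane in bits [7:6], destination lane in bits [5:4], and zero mask in
// bits [3:0].
static unsigned insertpsImmediate(unsigned SrcLane, unsigned DstLane,
                                  unsigned ZeroMask) {
  return ((SrcLane & 0x3) << 6) | ((DstLane & 0x3) << 4) | (ZeroMask & 0xf);
}
// Inserting into lane 2 with nothing zeroed is insertpsImmediate(0, 2, 0)
// == 0x20, which matches the "IdxVal << 4" constant built a few lines down.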
The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(IdxVal << 4); + // Create this as a scalar to vector.. + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::i32 || EltVT == MVT::i64) { + // PINSR* works with constant index. + return Op; + } + } if (EltVT == MVT::i8) return SDValue(); - if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { + if (EltVT.getSizeInBits() == 16) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); + N2 = DAG.getIntPtrConstant(IdxVal); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); @@ -10360,6 +13380,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI->setAdjustsStack(true); + MFI->setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); @@ -10593,7 +13614,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, MachinePointerInfo(), MVT::i32, - false, false, 0); + false, false, false, 0); else IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), false, false, false, 0); @@ -10677,9 +13698,17 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + SDLoc dl(Op); - if (SrcVT.isVector()) + if (SrcVT.isVector()) { + if (SrcVT.getVectorElementType() == MVT::i1) { + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, + Op.getOperand(0))); + } return SDValue(); + } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -10693,7 +13722,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } - SDLoc dl(Op); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); @@ -10880,19 +13908,135 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // The algorithm is the following: + // #ifdef __SSE4_1__ + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + // #else + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + // #endif + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // return (float4) lo + fhi; + + SDLoc DL(Op); + SDValue V = Op->getOperand(0); + EVT VecIntVT = V.getValueType(); + bool Is128 = VecIntVT == MVT::v4i32; + EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + // If we convert to something else than the supported type, e.g., to v4f64, + // abort early. 
+ if (VecFloatVT != Op->getValueType(0)) + return SDValue(); + + unsigned NumElts = VecIntVT.getVectorNumElements(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + assert(NumElts <= 8 && "The size of the constant array must be fixed"); + + // In the #idef/#else code, we have in common: + // - The vector of constants: + // -- 0x4b000000 + // -- 0x53000000 + // - A shift: + // -- v >> 16 + + // Create the splat vector for 0x4b000000. + SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); + SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, + CstLow, CstLow, CstLow, CstLow}; + SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstLowArray[0], NumElts)); + // Create the splat vector for 0x53000000. + SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); + SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, + CstHigh, CstHigh, CstHigh, CstHigh}; + SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstHighArray[0], NumElts)); + + // Create the right shift. + SDValue CstShift = DAG.getConstant(16, MVT::i32); + SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, + CstShift, CstShift, CstShift, CstShift}; + SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); + + SDValue Low, High; + if (Subtarget.hasSSE41()) { + EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + SDValue VecCstLowBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); + // Low will be bitcasted right away, so do not bother bitcasting back to its + // original type. + Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, + VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + SDValue VecCstHighBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); + // High will be bitcasted right away, so do not bother bitcasting back to + // its original type. + High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, + VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); + } else { + SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); + SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, + CstMask, CstMask, CstMask); + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); + Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); + + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); + } + + // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 
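// Illustrative sketch, not from this patch: the same trick on a single lane in
// plain C (needs <stdint.h> and <string.h>), to make the constants concrete.
// 0x4b000000 is 2^23, so OR-ing the low 16 bits into its mantissa yields
// 2^23 + lo; 0x53000000 is 2^39, so the high half becomes 2^39 + hi * 2^16.
//
//   static float uint32ToFloat(uint32_t V) {
//     uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // 2^23 + lo
//     uint32_t HiBits = (V >> 16)     | 0x53000000u; // 2^39 + hi * 2^16
//     float Lo, Hi;
//     memcpy(&Lo, &LoBits, sizeof(Lo));
//     memcpy(&Hi, &HiBits, sizeof(Hi));
//     float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // exact: hi * 2^16 - 2^23
//     return Lo + FHi; // (2^23 + lo) + (hi * 2^16 - 2^23) == V, rounded once
//   }
//
// That is why the node below materializes -(0x1.0p39f + 0x1.0p23f), i.e. the
// float with bit pattern 0xD3000080, and folds the subtraction into an FADD.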
+ SDValue CstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); + SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, + CstFAdd, CstFAdd, CstFAdd, CstFAdd}; + SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, + makeArrayRef(&CstFAddArray[0], NumElts)); + + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); + SDValue FHigh = + DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); + // return (float4) lo + fhi; + SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); + return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); +} + SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); - assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || - SVT == MVT::v8i8 || SVT == MVT::v8i16) && - "Custom UINT_TO_FP is not supported!"); - - MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + switch (SVT.SimpleTy) { + default: + llvm_unreachable("Custom UINT_TO_FP is not supported!"); + case MVT::v4i8: + case MVT::v4i16: + case MVT::v8i8: + case MVT::v8i16: { + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + } + case MVT::v4i32: + case MVT::v8i32: + return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + } + llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -10978,7 +14122,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, 4); + MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. 
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); @@ -11192,12 +14336,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) { assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && "Invalid scalar TRUNCATE operation"); - if (InVT == MVT::i32) + if (InVT.getSizeInBits() >= 32) return SDValue(); - if (InVT.getSizeInBits() == 64) - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In); - else if (InVT.getSizeInBits() < 32) - In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); return DAG.getNode(ISD::TRUNCATE, DL, VT, In); } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && @@ -11215,7 +14356,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; } - + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, getPointerTy()); @@ -11375,58 +14516,47 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) { - LLVMContext *Context = DAG.getContext(); +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). +static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && + "Wrong opcode for lowering FABS or FNEG."); + + bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + // Assume scalar op for initialization; update for vector if needed. + // Note that there are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to + // decide if we should generate a 16-byte constant mask when we only need 4 or + // 8 bytes for the scalar case. if (VT.isVector()) { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, ~(1ULL << 63)))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, ~(1U << 31)))); - C = ConstantVector::getSplat(NumElts, C); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - if (VT.isVector()) { - MVT ANDVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::AND, dl, ANDVT, - DAG.getNode(ISD::BITCAST, dl, ANDVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); - } - return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); -} -static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { + unsigned EltBits = EltVT.getSizeInBits(); LLVMContext *Context = DAG.getContext(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 2 : 4; - if (VT.isVector()) { - EltVT = VT.getVectorElementType(); - NumElts = VT.getVectorNumElements(); - } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, 1ULL << 63))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, 1U << 31))); + // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... + APInt MaskElt = + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); @@ -11434,16 +14564,24 @@ static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); + if (VT.isVector()) { - MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); + // For a vector, cast operands to a vector type, perform the logic op, + // and cast the result back to the original value type. + MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); + SDValue Operand = IsFNABS ? + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); + unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); + DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + // If not vector, then scalar. + unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; + return DAG.getNode(BitOp, dl, VT, Operand, Mask); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -11469,19 +14607,17 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. - // First get the sign bit of second operand. - SmallVector<Constant*,4> CV; - if (SrcVT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); - } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - } + const fltSemantics &Sem = + VT == MVT::f64 ? 
APFloat::IEEEdouble : APFloat::IEEEsingle; + const unsigned SizeInBits = VT.getSizeInBits(); + + SmallVector<Constant *, 4> CV( + VT == MVT::f64 ? 2 : 4, + ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + + // First, clear all bits but the sign bit from the second operand (sign). + CV[0] = ConstantFP::get(*Context, + APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, @@ -11489,40 +14625,30 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); - // Shift sign bit right or left if the two operands have different types. - if (SrcVT.bitsGT(VT)) { - // Op0 is MVT::f32, Op1 is MVT::f64. - SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); - SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, - DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); - SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, - DAG.getIntPtrConstant(0)); - } - - // Clear first operand sign bit. - CV.clear(); - if (VT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); + // Next, clear the sign bit from the first operand (magnitude). + // If it's a constant, we can clear it here. + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + APFloat APF = Op0CN->getValueAPF(); + // If the magnitude is a positive zero, the sign bit alone is enough. + if (APF.isPosZero()) + return SignBit; + APF.clearSign(); + CV[0] = ConstantFP::get(*Context, APF); } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV[0] = ConstantFP::get( + *Context, + APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); - SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); - - // Or the value with the sign bit. + SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + // If the magnitude operand wasn't a constant, we need to AND out the sign. + if (!isa<ConstantFPSDNode>(Op0)) + Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); + + // OR the magnitude value with the sign bit. return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } @@ -11537,8 +14663,7 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } -// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. -// +// Check whether an OR'd tree is PTEST-able. 
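// Illustrative sketch, not part of this patch: the scalar bit tricks that the
// merged FABS/FNEG/FNABS lowering and the rewritten FCOPYSIGN lowering above
// express with 16-byte constant-pool masks and FAND/FOR/FXOR nodes (the wide
// mask lets the load fold into the logic instruction).
#include <cstdint>
#include <cstring>
static double fabsBits(double X) {  // FAND with the ~sign-bit mask
  uint64_t B; std::memcpy(&B, &X, 8); B &= ~(1ULL << 63);
  std::memcpy(&X, &B, 8); return X;
}
static double fnegBits(double X) {  // FXOR with the sign-bit mask
  uint64_t B; std::memcpy(&B, &X, 8); B ^= (1ULL << 63);
  std::memcpy(&X, &B, 8); return X;
}
static double fnabsBits(double X) { // FOR with the sign-bit mask
  uint64_t B; std::memcpy(&B, &X, 8); B |= (1ULL << 63);
  std::memcpy(&X, &B, 8); return X;
}
static double copysignBits(double Mag, double Sgn) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, 8); std::memcpy(&S, &Sgn, 8);
  M = (M & ~(1ULL << 63)) | (S & (1ULL << 63)); // clear sign, OR in Sgn's sign
  std::memcpy(&Mag, &M, 8);
  return Mag;
}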
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); @@ -11897,12 +15022,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); } - + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Do the comparison at i32 if it's smaller, besides the Atom case. - // This avoids subregister aliasing issues. Keep the smaller reference - // if we're optimizing for size, however, as that'll allow better folding + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( @@ -11946,6 +15071,66 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor. + if (!Subtarget->useReciprocalEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. + // TODO: Add support for AVX512 (v16f32). 
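// Illustrative sketch, not from this patch: the refinement that the DAG
// combiner performs on these hardware estimates, written for one scalar lane.
// The estimate is accurate to about 2^-12, and each Newton-Raphson step
// roughly doubles the number of correct bits, hence one step for f32.
static float refineRsqrt(float A, float Est) { // Est ~ 1/sqrt(A)
  return Est * (1.5f - 0.5f * A * Est * Est);
}
static float refineRecip(float A, float Est) { // Est ~ 1/A
  return Est * (2.0f - A * Est);
}
// The reciprocal path takes its step count from the
// ReciprocalEstimateRefinementSteps option rather than hard-coding one step.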
+ // It is likely not profitable to do this for f64 because a double-precision + // reciprocal estimate with refinement on x86 prior to FMA requires + // 15 instructions: convert to single, rcpss, convert back to double, refine + // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = ReciprocalEstimateRefinementSteps; + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); @@ -12105,7 +15290,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && + assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && Op.getValueType().getScalarType() == MVT::i1 && "Cannot set masked compare for this operation"); @@ -12219,11 +15404,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, EVT OpVT = Op1.getValueType(); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || + (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, - // But there is no compare instruction for i8 and i16 elements. + // But there is no compare instruction for i8 and i16 elements in KNL. // We are not talking about 512-bit operands in this case, these // types are illegal. if (MaskResult && @@ -12426,8 +15612,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { cast<ConstantSDNode>(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) + if (NewSetCC.getNode()) { + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; + } } // Look for X == 0, X == 1, X != 0, or X != 1. 
We can simplify some forms of @@ -12729,18 +15918,40 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) + return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); @@ -12768,7 +15979,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return LowerSIGN_EXTEND_AVX512(Op, DAG); + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && @@ -12811,6 +16022,208 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +// Lower vector extended loads using a shuffle. If SSSE3 is not available we +// may emit an illegal shuffle but the expansion is still better than scalar +// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// we'll emit a shuffle and a arithmetic shift. +// TODO: It is possible to support ZExt by zeroing the undef values during +// the shuffle phase or after the shuffle. +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT RegVT = Op.getSimpleValueType(); + assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isInteger() && + "We only custom lower integer vector sext loads."); + + // Nothing useful we can do without SSE2 shuffles. 
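// Illustrative sketch, not part of this patch: the pre-SSE4.1 sext path below
// in one scalar lane. The shuffle parks each narrow element in the high bits
// of its widened lane, and an arithmetic right shift by (wide - narrow) bits
// both moves it back down and replicates the sign bit. For a v8i8 -> v8i16
// sextload this means SizeRatio == 2, the loaded bytes land at the odd
// positions of a v16i8, and the shift amount is 16 - 8 = 8.
#include <cstdint>
static int16_t sextByteViaShift(uint8_t MemByte) {
  int16_t Wide = (int16_t)((uint16_t)MemByte << 8); // element placed high
  return (int16_t)(Wide >> 8); // assumes the usual arithmetic >> on negatives
}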
+ assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. + EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getSExtOrTrunc(Load, dl, RegVT); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz == 256) + loadRegZize /= 2; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize / MemVT.getScalarType().getSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = + DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1, we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + + // Otherwise we'll shuffle the small elements in the high bits of the + // larger type and perform an arithmetic shift. If the shift is not legal + // it's better to scalarize. + assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && + "We can't implement a sext load without an arithmetic right shift!"); + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; + + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + + // Build the arithmetic shift. 
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - + MemVT.getVectorElementType().getSizeInBits(); + Shuff = + DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; + } + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; +} + // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. @@ -13116,7 +16529,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. -// Calls to _alloca is needed to probe the stack when allocating more than 4k +// Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. @@ -13125,7 +16538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || SplitStack; SDLoc dl(Op); @@ -13151,7 +16564,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -13174,7 +16587,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + EVT SPTy = getPointerTy(); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -13192,7 +16605,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = - getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -13201,7 +16614,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); @@ -13209,8 +16622,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -13451,7 +16864,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { - assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + MVT SVT = ShAmt.getSimpleValueType(); + assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) @@ -13466,13 +16880,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - // Need to build a vector containing shift amount - // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); + const X86Subtarget &Subtarget = + DAG.getTarget().getSubtarget<X86Subtarget>(); + if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + // Let the shuffle legalizer expand this shift amount node. + SDValue Op0 = ShAmt.getOperand(0); + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + } else { + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + SmallVector<SDValue, 4> ShOps; + ShOps.push_back(ShAmt); + if (SVT == MVT::i32) { + ShOps.push_back(DAG.getConstant(0, SVT)); + ShOps.push_back(DAG.getUNDEF(SVT)); + } + ShOps.push_back(DAG.getUNDEF(SVT)); + + MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + } // The return type has to be a 128-bit type with the same element // type as the input type. @@ -13483,382 +16912,271 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - SDLoc dl(Op); - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. - // Comparison intrinsics. 
- case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomineq_sd: { - unsigned Opc; - ISD::CondCode CC; +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting for \p Mask when lowering masking intrinsics. +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + + assert(MaskVT.isSimple() && "invalid mask type"); + + if (isAllOnes(Mask)) + return Op; + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +} + +/// \brief Creates an SDNode for a predicated scalar operation. +/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). +/// The mask is comming as MVT::i8 and it should be truncated +/// to MVT::i1 while lowering masking intrinsics. +/// The main difference between ScalarMaskingNode and VectorMaskingNode is using +/// "X86select" instead of "vselect". We just can't create the "vselect" node for +/// a scalar instruction. +static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (isAllOnes(Mask)) + return Op; + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); +} + +static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
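// Illustrative sketch, not part of this patch: the element-wise behaviour of
// getVectorMaskingNode above. Each result lane takes the freshly computed
// value when its mask bit is set and the preserved (pass-through) source
// otherwise; an all-ones mask returns the operation unchanged, and compare
// nodes are AND-ed with the mask instead of going through a select.
static void maskedSelect(const int *Op, const int *PassThru, unsigned Mask,
                         int *Out, int NumElts) {
  for (int I = 0; I != NumElts; ++I)
    Out[I] = ((Mask >> I) & 1) ? Op[I] : PassThru[I];
}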
- case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse2_comieq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse2_comilt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse2_comile_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse2_comigt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse2_comige_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse2_comineq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETNE; - break; - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse2_ucomieq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse2_ucomilt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse2_ucomile_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse2_ucomigt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse2_ucomige_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_ucomineq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETNE; - break; + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + return X86ISD::FMADD; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + case Intrinsic::x86_fma_mask_vfmsub_pd_512: + return X86ISD::FMSUB; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + return X86ISD::FNMADD; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case Intrinsic::x86_fma_mask_vfnmsub_pd_512: + return X86ISD::FNMSUB; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + return X86ISD::FMADDSUB; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: + return X86ISD::FMSUBADD; } +} - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); - assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); - SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - 
DAG.getConstant(X86CC, MVT::i8), Cond); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); - } +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { + switch(IntrData->Type) { + case INTR_TYPE_1OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + case INTR_TYPE_2OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + case INTR_TYPE_3OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_1OP_MASK_RM: { + SDValue Src = Op.getOperand(1); + SDValue Src0 = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue RoundingMode = Op.getOperand(4); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + RoundingMode), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src0 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue RoundingMode = Op.getOperand(5); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + RoundingMode), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK: { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), + Op.getOperand(2)), + Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); + } + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + EVT VT = Op.getOperand(1).getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 
4 : 3); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + case COMI: { // Comparison intrinsics + ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case VSHIFT: + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + case VSHIFT_MASK: + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_EXPAND_IN_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnes(Mask)) // return data as is + return Op.getOperand(1); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); - // Arithmetic intrinsics. 
- case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_avx2_pmulu_dq: - return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulhu_w: - case Intrinsic::x86_avx2_pmulhu_w: - return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulh_w: - case Intrinsic::x86_avx2_pmulh_w: - return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE2/AVX2 sub with unsigned saturation intrinsics - case Intrinsic::x86_sse2_psubus_b: - case Intrinsic::x86_sse2_psubus_w: - case Intrinsic::x86_avx2_psubus_b: - case Intrinsic::x86_avx2_psubus_w: - return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE3/AVX horizontal add/sub intrinsics - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - Opcode = X86ISD::FHADD; - break; - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - Opcode = X86ISD::FHSUB; - break; - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - Opcode = X86ISD::HADD; - break; - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: - Opcode = X86ISD::HSUB; - break; + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, + PassThru); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE2/SSE41/AVX2 integer max/min intrinsics. 
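The per-intrinsic switch cases being deleted here are what the new X86IntrinsicsInfo.h table replaces: each intrinsic maps to a small record (a type tag plus one or two opcodes), and lowering becomes a table lookup feeding a handful of generic cases. A minimal sketch of that table-lookup pattern in plain C++ (the IDs, field values, and names below are illustrative, not the real table):

#include <algorithm>
#include <cstdio>
#include <iterator>

enum IntrinsicType { INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP };

struct IntrinsicData {
  unsigned Id;          // intrinsic number (sort key)
  IntrinsicType Type;   // which generic lowering to apply
  unsigned Opc0, Opc1;  // target opcode(s) to emit
  bool operator<(const IntrinsicData &RHS) const { return Id < RHS.Id; }
};

// Table kept sorted by Id so lookup can use binary search.
static const IntrinsicData Table[] = {
    {100, INTR_TYPE_2OP, /*PMULUDQ=*/1, 0},
    {101, INTR_TYPE_2OP, /*PMULHUW=*/2, 0},
    {102, INTR_TYPE_1OP, /*FSQRT=*/3, 0},
};

static const IntrinsicData *lookup(unsigned IntNo) {
  const IntrinsicData Key = {IntNo, INTR_TYPE_1OP, 0, 0};
  const IntrinsicData *I =
      std::lower_bound(std::begin(Table), std::end(Table), Key);
  return (I != std::end(Table) && I->Id == IntNo) ? I : nullptr;
}

int main() {
  if (const IntrinsicData *D = lookup(101))
    std::printf("intrinsic 101 -> type %d, opcode %u\n", (int)D->Type, D->Opc0);
  return 0;
}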
- case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - Opcode = X86ISD::UMAX; - break; - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - Opcode = X86ISD::UMIN; - break; - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - Opcode = X86ISD::SMAX; - break; - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: - Opcode = X86ISD::SMIN; - break; + case BLEND: { + SDValue Mask = Op.getOperand(3); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE/SSE2/AVX floating point max/min intrinsics. - case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - Opcode = X86ISD::FMAX; - break; - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: - Opcode = X86ISD::FMIN; - break; + case FMA_OP_MASK: + { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // AVX2 variable shift intrinsics - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - Opcode = ISD::SHL; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - Opcode = ISD::SRL; - break; - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - Opcode = ISD::SRA; + default: break; } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx2_packusdw: - return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshuf_d: - return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufl_w: - return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufh_w: - return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_psign_b_128: - case Intrinsic::x86_ssse3_psign_w_128: - case Intrinsic::x86_ssse3_psign_d_128: - case Intrinsic::x86_avx2_psign_b: - case Intrinsic::x86_avx2_psign_w: - case Intrinsic::x86_avx2_psign_d: - return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse41_insertps: - return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::x86_avx_vperm2f128_ps_256: - case Intrinsic::x86_avx_vperm2f128_pd_256: - case Intrinsic::x86_avx_vperm2f128_si_256: - case 
Intrinsic::x86_avx2_vperm2i128: - return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + } - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - // Operands intentionally swapped. Mask is last operand to intrinsic, - // but second operand for node/instruction. - return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(1)); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::x86_sse_sqrt_ps: - case Intrinsic::x86_sse2_sqrt_pd: - case Intrinsic::x86_avx_sqrt_ps_256: - case Intrinsic::x86_avx_sqrt_pd_256: - return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: + case Intrinsic::x86_avx512_mask_valign_d_512: + // Vector source operands are swapped. + return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, + Op.getValueType(), Op.getOperand(2), + Op.getOperand(1), + Op.getOperand(3)), + Op.getOperand(5), Op.getOperand(4), + Subtarget, DAG); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -13936,100 +17254,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // SSE/AVX shift intrinsics - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - Opcode = X86ISD::VSHL; - break; - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - Opcode = X86ISD::VSRL; - break; - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: - Opcode = X86ISD::VSRA; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE/AVX immediate shift intrinsics - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - Opcode = X86ISD::VSHLI; - break; - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - Opcode = X86ISD::VSRLI; - break; - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - Opcode = X86ISD::VSRAI; - break; - } - return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); - } - case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -14106,6 +17330,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + case Intrinsic::x86_fma_mask_vfmsub_pd_512: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case Intrinsic::x86_fma_mask_vfnmsub_pd_512: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { + auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); + if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); + else + return 
SDValue(); + } + case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmsub_ps: @@ -14130,74 +17380,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - Opc = X86ISD::FMADD; - break; - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - Opc = X86ISD::FMSUB; - break; - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - Opc = X86ISD::FNMADD; - break; - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - Opc = X86ISD::FNMSUB; - break; - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - Opc = X86ISD::FMADDSUB; - break; - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: - Opc = X86ISD::FMSUBADD; - break; - } - - return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); - } + return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -14382,122 +17566,25 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST -}; - -struct IntrinsicData { - IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) - :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} - IntrinsicType Type; - unsigned Opc0; - unsigned Opc1; -}; - -std::map < unsigned, IntrinsicData> IntrMap; -static void InitIntinsicsMap() { - static bool Initialized = false; - if (Initialized) - return; - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - 
IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, - IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, - IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, - IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, - IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, - IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, - IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, - IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, - IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, - IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, - IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, - IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, - X86::VGATHERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, - X86::VGATHERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, - X86::VGATHERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, - X86::VGATHERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm, - X86::VSCATTERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm, - X86::VSCATTERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm, - X86::VSCATTERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm, - X86::VSCATTERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16, - 
IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_xtest, - IntrinsicData(XTEST, X86ISD::XTEST, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc, - IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, - IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, - IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); - Initialized = true; -} static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - InitIntinsicsMap(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo); - if (itr == IntrMap.end()) + + const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + if (!IntrData) return SDValue(); SDLoc dl(Op); - IntrinsicData Intr = itr->second; - switch(Intr.Type) { + switch(IntrData->Type) { + default: + llvm_unreachable("Unknown Intrinsic Type"); + break; case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. @@ -14521,7 +17608,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -14532,7 +17619,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -14540,7 +17627,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (dyn_cast<ConstantSDNode> (Hint) == nullptr || (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); + unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -14551,7 +17638,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector<SDValue, 2> Results; - getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. 
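At the source level, the CF=1 check modelled by the RDRAND/RDSEED lowering corresponds to the *_step intrinsics returning a status flag. A sketch of the usual retry loop, assuming a compiler that provides <immintrin.h> and -mrdrnd (guarded so it still builds without RDRAND support):

#include <cstdio>
#if defined(__RDRND__)
#include <immintrin.h>
#endif

// Returns true and stores a hardware random value if RDRAND reported CF=1
// within a few retries; RDRAND can fail transiently and must be retried.
bool hwRandom32(unsigned &Out) {
#if defined(__RDRND__)
  for (int Tries = 0; Tries < 10; ++Tries)
    if (_rdrand32_step(&Out))   // returns 1 when CF=1 (value is valid)
      return true;
#endif
  (void)Out;
  return false;
}

int main() {
  unsigned V = 0;
  if (hwRandom32(V))
    std::printf("rdrand: %u\n", V);
  else
    std::printf("rdrand unavailable\n");
  return 0;
}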
@@ -14563,7 +17650,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); - SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_NE, MVT::i8), InTrans); @@ -14571,8 +17658,79 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } + // ADC/ADCX/SBB + case ADX: { + SmallVector<SDValue, 2> Results; + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), + DAG.getConstant(-1, MVT::i8)); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), + Op.getOperand(4), GenCF.getValue(1)); + SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), + Op.getOperand(5), MachinePointerInfo(), + false, false, 0); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_B, MVT::i8), + Res.getValue(1)); + Results.push_back(SetCC); + Results.push_back(Store); + return DAG.getMergeValues(Results, dl); + } + case COMPRESS_TO_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + if (isAllOnes(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, 0); + + EVT VT = DataToCompress.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, + DataToCompress, DAG.getUNDEF(VT)); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, 0); + } + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PathThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + EVT VT = Op.getValueType(); + + if (isAllOnes(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, 0); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, 0); + + SmallVector<SDValue, 2> Results; + Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, + PathThru)); + Results.push_back(Chain); + return DAG.getMergeValues(Results, dl); + } } - llvm_unreachable("Unknown Intrinsic Type"); } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, @@ -14589,8 +17747,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue 
Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -14611,9 +17769,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister( + DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -14640,8 +17799,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -14652,8 +17811,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -14700,7 +17859,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -14864,7 +18023,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -15198,29 +18357,16 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. - // The internal representation is big endian. - // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0 - // and its low part at index 1. 
- // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh> - // Vector index 0 1 ; 2 3 - // We want <ae|bf|cg|dh> - // Vector index 0 2 1 3 - // Since each element is seen as 2 x i32, we get: - // high_mask[i] = 2 x vector_index[i] - // low_mask[i] = 2 x vector_index[i] + 1 - // where vector_index = {0, Size/2, 1, Size/2 + 1, ..., - // Size/2 - 1, Size/2 + Size/2 - 1} - // where Size is the number of element of the final vector. SDValue Highs, Lows; if (VT == MVT::v8i32) { - const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } else { - const int HighMask[] = {0, 4, 2, 6}; + const int HighMask[] = {1, 5, 3, 7}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {1, 5, 3, 7}; + const int LowMask[] = {0, 4, 2, 6}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } @@ -15238,9 +18384,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - // The low part of a MUL_LOHI is supposed to be the first value and the - // high part the second value. - return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs); + // The first result of MUL_LOHI is actually the low value, followed by the + // high value. + SDValue Ops[] = {Lows, Highs}; + return DAG.getMergeValues(Ops, dl); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -15430,55 +18577,43 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); - if (Amt.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned i, j; - for (i = 0; i != NumElts; ++i) { - if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) - continue; - break; - } - for (j = i; j != NumElts; ++j) { - SDValue Arg = Amt.getOperand(j); - if (Arg.getOpcode() == ISD::UNDEF) continue; - if (Arg != Amt.getOperand(i)) - break; - } - if (i != NumElts && j == NumElts) - BaseShAmt = Amt.getOperand(i); + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { + // Check if this build_vector node is doing a splat. + // If so, then set BaseShAmt equal to the splat value. 
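The getSplatValue/isSplat checks above are what allow a vector shift by a uniform amount to collapse to a single shift by a scalar. A standalone sketch of the same decision on plain arrays (hypothetical helper names; it ignores undef lanes and the insert/shuffle forms the real code also recognizes):

#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

// If every lane of Amt holds the same value, return it; that is the scalar
// "BaseShAmt" case, where one shift-by-scalar covers all lanes.
template <size_t N>
std::optional<uint32_t> getSplatAmount(const std::array<uint32_t, N> &Amt) {
  for (size_t i = 1; i < N; ++i)
    if (Amt[i] != Amt[0])
      return std::nullopt;
  return Amt[0];
}

template <size_t N>
void shiftLeft(std::array<uint32_t, N> &V, const std::array<uint32_t, N> &Amt) {
  if (auto Splat = getSplatAmount(Amt)) {        // uniform: one shift amount
    for (auto &Lane : V)
      Lane <<= *Splat;
    return;
  }
  for (size_t i = 0; i < N; ++i)                 // otherwise per-lane shifts
    V[i] <<= Amt[i];
}

int main() {
  std::array<uint32_t, 4> V{1, 2, 3, 4}, Amt{3, 3, 3, 3};
  shiftLeft(V, Amt);
  std::printf("%u %u %u %u\n", V[0], V[1], V[2], V[3]);
  return 0;
}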
+ BaseShAmt = BV->getSplatValue(); + if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); - if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && - cast<ShuffleVectorSDNode>(Amt)->isSplat()) { + + ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); + if (SVN && SVN->isSplat()) { + unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = InVec.getValueType().getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = InVec.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } + assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + "Unexpected shuffle index found!"); + BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { - unsigned SplatIdx = - cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } - if (!BaseShAmt.getNode()) - BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, - DAG.getIntPtrConstant(0)); + + if (!BaseShAmt) + // Avoid introducing an extract element from a shuffle. + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, + DAG.getIntPtrConstant(SplatIdx)); } } if (BaseShAmt.getNode()) { - if (EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); + if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); @@ -15596,7 +18731,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. - if (Op.getOpcode() == ISD::SHL && + if (Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { @@ -15688,15 +18823,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, CanBeSimplified = Amt2 == Amt->getOperand(j); } } - + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. EVT CastVT = MVT::v4i32; - SDValue Splat1 = + SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); - SDValue Splat2 = + SDValue Splat2 = DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) @@ -15851,10 +18986,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Cond = X86::COND_B; break; case ISD::SMULO: - BaseOp = X86ISD::SMUL; + BaseOp = N->getValueType(0) == MVT::i8 ? 
X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); @@ -15880,6 +19020,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } +// Sign extension of the low part of vector elements. This may be used either +// when sign extend instructions are not available or if the vector element +// sizes already match the sign-extended size. If the vector elements are in +// their pre-extended size and sign extend instructions are available, that will +// be handled by LowerSIGN_EXTEND. SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -15925,37 +19070,151 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, case MVT::v4i32: case MVT::v8i16: { SDValue Op0 = Op.getOperand(0); - SDValue Op00 = Op0.getOperand(0); - SDValue Tmp1; - // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. - if (Op0.getOpcode() == ISD::BITCAST && - Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { - // (sext (vzext x)) -> (vsext x) - Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); - if (Tmp1.getNode()) { - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - // This folding is only valid when the in-reg type is a vector of i8, - // i16, or i32. - if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || - ExtraEltVT == MVT::i32) { - SDValue Tmp1Op0 = Tmp1.getOperand(0); - assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && - "This optimization is invalid without a VZEXT."); - return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); - } - Op0 = Tmp1; - } - } - // If the above didn't work, then just use Shift-Left + Shift-Right. - Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, - DAG); - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, + // This is a sign extension of some low part of vector elements without + // changing the size of the vector elements themselves: + // Shift-Left + Shift-Right-Algebraic. + SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, + BitsDiff, DAG); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, DAG); } } } +/// Returns true if the operand type is exactly twice the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. 
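The Shift-Left + Shift-Right-Algebraic pattern used for SIGN_EXTEND_INREG above has a direct scalar analogue. A small sketch (it assumes the usual arithmetic right shift on signed integers and wrap-around conversion from unsigned, which hold for x86 compilers):

#include <cstdint>
#include <cstdio>

// Sign-extend the low 'Bits' bits of V in place, without changing its width:
// shift the field up to the sign position, then shift back arithmetically.
int32_t signExtendInReg(int32_t V, unsigned Bits) {
  const unsigned Diff = 32 - Bits;
  return (int32_t)((uint32_t)V << Diff) >> Diff; // shl, then sra
}

int main() {
  // The low 8 bits of 0x80 denote -128 once sign-extended in place.
  std::printf("%d\n", signExtendInReg(0x80, 8)); // prints -128
  std::printf("%d\n", signExtendInReg(0x7f, 8)); // prints 127
  return 0;
}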
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()); +} + +bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return needsCmpXchgNb(MemType); + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more agressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. 
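As a compact restatement of the policy in shouldExpandAtomicRMWInIR above (a standalone sketch, not the LLVM interface): xchg/add/sub map directly to xchg/xadd, the bitwise ops can stay as lock-prefixed instructions when their result is unused, everything else needs a cmpxchg loop, and oversized operands depend on cmpxchg8b/16b or fall back to library calls.

#include <cstdio>

enum class RMWOp { Xchg, Add, Sub, Or, And, Xor, Nand, Max, Min, UMax, UMin };
enum class Expansion { None, LockedInstr, CmpXchgLoop, LibCall };

// Decide how an atomicrmw of width 'Bits' should be handled on x86.
// 'ResultUsed' matters for or/and/xor: if the result is unused they can be a
// single lock-prefixed memory instruction.
Expansion classifyAtomicRMW(RMWOp Op, unsigned Bits, bool ResultUsed,
                            bool Is64Bit, bool HasCmpXchg16b) {
  const unsigned Native = Is64Bit ? 64 : 32;
  if (Bits > Native) {
    if (Bits == 64 && !Is64Bit)
      return Expansion::CmpXchgLoop;              // via cmpxchg8b
    if (Bits == 128 && HasCmpXchg16b)
      return Expansion::CmpXchgLoop;              // via cmpxchg16b
    return Expansion::LibCall;                    // __sync_fetch_and_* call
  }
  switch (Op) {
  case RMWOp::Xchg: case RMWOp::Add: case RMWOp::Sub:
    return Expansion::None;                       // xchg / xadd handle these
  case RMWOp::Or: case RMWOp::And: case RMWOp::Xor:
    return ResultUsed ? Expansion::CmpXchgLoop : Expansion::LockedInstr;
  default:
    return Expansion::CmpXchgLoop;                // nand/min/max need a loop
  }
}

int main() {
  std::printf("%d\n", (int)classifyAtomicRMW(RMWOp::Or, 32, false, true, true));
  return 0;
}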
+ if (SynchScope == SingleThread) { + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + } else if (hasMFENCE(Subtarget)) { + Function *MFence = llvm::Intrinsic::getDeclaration(M, + Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence); + } else { + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + } + + // Finally we can emit the atomic load. + LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -15967,10 +19226,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { - // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for - // no-sse2). There isn't any reason to disable it if the target processor - // supports it. - if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -16085,6 +19341,139 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + + Op = Op.getOperand(0); + EVT VT = Op.getValueType(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "CTPOP lowering only implemented for 128/256-bit wide vector types"); + + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + unsigned Len = EltVT.getSizeInBits(); + + // This is the vectorized version of the "best" algorithm from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // with a minor tweak to use a series of adds + shifts instead of vector + // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: + // + // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled + // v8i32 => Always profitable + // + // FIXME: There a couple of possible improvements: + // + // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). + // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html + // + assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && + "CTPOP not implemented for this vector element type."); + + // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid + // extra legalization. + bool NeedsBitcast = EltVT == MVT::i32; + MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; + + SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); + SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); + SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); + + // v = v - ((v >> 1) & 0x55555555...) 
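For reference, the scalar form of the bit-hack that the vector code implements per element, with the final multiply replaced by the same add+shift reduction (a sketch only; the lowering applies these steps lane-wise with the bitcasts shown below):

#include <cstdint>
#include <cstdio>

// Population count via the parallel bit-counting trick, using adds and shifts
// for the final horizontal reduction instead of a multiply.
unsigned popcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                  // 2-bit partial sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u);  // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                  // 8-bit partial sums
  V = V + (V >> 8);                                  // fold halves...
  V = V + (V >> 16);                                 // ...into the low byte
  return V & 0x3F;                                   // result fits in 6 bits
}

int main() {
  std::printf("%u %u %u\n", popcount32(0), popcount32(0xFFFFFFFFu),
              popcount32(0x12345678u));
  return 0;
}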
+ SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT)); + SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); + SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); + if (NeedsBitcast) + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + + SmallVector<SDValue, 8> Mask55(NumElts, Cst55); + SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); + if (NeedsBitcast) + M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); + + SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); + + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) + SmallVector<SDValue, 8> Mask33(NumElts, Cst33); + SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); + SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT)); + SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); + + Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); + if (NeedsBitcast) { + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); + Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); + } + + SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); + SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); + if (VT != AndRHS.getValueType()) { + AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); + AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); + } + SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT)); + SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + + SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); + } + And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + // The algorithm mentioned above uses: + // v = (v * 0x01010101...) >> (Len - 8) + // + // Change it to use vector adds + vector shifts which yield faster results on + // Haswell than using vector integer multiplication. + // + // For i32 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // + // For i64 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // v = v + (v >> 32) + // + Add = And; + SmallVector<SDValue, 8> Csts; + for (unsigned i = 8; i <= Len/2; i *= 2) { + Csts.assign(NumElts, DAG.getConstant(i, EltVT)); + SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + Csts.clear(); + } + + // The result is on the least significant 6-bits on i32 and 7-bits on i64. + SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 
0x3F : 0x7F), EltVT); + SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F); + SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); + } + And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + return And; +} + static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); @@ -16181,7 +19570,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, NULL) + ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); @@ -16212,6 +19601,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -16240,8 +19630,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FABS: return LowerFABS(Op, DAG); - case ISD::FNEG: return LowerFNEG(Op, DAG); + case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); + case ISD::FABS: + case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); @@ -16251,7 +19642,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -16292,29 +19683,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } -static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) { - SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); - - // Convert wide load -> cmpxchg8b/cmpxchg16b - // FIXME: On 32-bit, load -> fild or movq would be more efficient - // (The only way to get a 16-byte load is cmpxchg16b) - // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 
- SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); - SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, - Node->getOperand(0), Node->getOperand(1), Zero, Zero, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); - Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(2)); -} - /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -16325,6 +19693,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. + case X86ISD::FMINC: + case X86ISD::FMIN: + case X86ISD::FMAXC: + case X86ISD::FMAX: { + EVT VT = N->getValueType(0); + if (VT != MVT::v2f32) + llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); + SDValue UNDEF = DAG.getUNDEF(VT); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); + return; + } case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -16473,12 +19857,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle - // should have already been dealt with by X86AtomicExpand.cpp. + // should have already been dealt with by AtomicExpandPass.cpp. 
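The v2f32 FMIN/FMAX case added to ReplaceNodeResults above widens the illegal 2-lane type to the legal 4-lane one by padding with undef and ignoring the extra lanes. A scalar sketch of that widening idea (std::min stands in for the target's min and has different NaN behaviour; the padding here is zero rather than undef):

#include <algorithm>
#include <array>
#include <cstdio>

// Widen a 2-lane fmin to the 4-lane operation the target supports; the two
// extra lanes are padding and never read back.
std::array<float, 2> fminV2(std::array<float, 2> A, std::array<float, 2> B) {
  std::array<float, 4> WA{A[0], A[1], 0, 0}, WB{B[0], B[1], 0, 0}, WR{};
  for (int i = 0; i < 4; ++i)
    WR[i] = std::min(WA[i], WB[i]);  // the legal 4-lane operation
  return {WR[0], WR[1]};             // only the original lanes are used
}

int main() {
  auto R = fminV2({1.0f, -2.0f}, {0.5f, 3.0f});
  std::printf("%g %g\n", R[0], R[1]);
  return 0;
}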
break; - case ISD::ATOMIC_LOAD: { - ReplaceATOMIC_LOAD(N, Results, DAG); - return; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); @@ -16561,8 +19943,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; @@ -16618,6 +20000,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SMUL8: return "X86ISD::SMUL8"; + case X86ISD::UMUL8: return "X86ISD::UMUL8"; + case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; @@ -16633,6 +20019,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -16652,7 +20039,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; - case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; @@ -16678,6 +20065,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; + case X86ISD::SELECT: return "X86ISD::SELECT"; } } @@ -16868,6 +20258,14 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (VT.getSizeInBits() == 64) return false; + // This is an experimental legality test that is tailored to match the + // legality test of the experimental lowering more closely. They are gated + // separately to ease testing of performance differences. + if (ExperimentalVectorShuffleLegality) + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. + return isTypeLegal(SVT); + // If this is a single-input shuffle with no 128 bit lane crossings we can // lower it into pshufb. 
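The condition described just above ("single-input shuffle with no 128-bit lane crossings", i.e. a pshufb candidate) can be stated directly on the mask. A standalone sketch of that mask test (helper name made up; the real check also requires SSSE3 for 128-bit and AVX2 for 256-bit vectors):

#include <cstdio>
#include <vector>

// A mask is pshufb-friendly if every element is either undef (-1) or selects
// from the first input only, without leaving its own 128-bit lane.
bool isSingleInputInLaneMask(const std::vector<int> &Mask, unsigned EltBits) {
  const unsigned NumElts = Mask.size();
  const unsigned EltsPerLane = 128 / EltBits;
  for (unsigned i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                     // undef lane
    if ((unsigned)M >= NumElts)
      return false;                                 // uses the second input
    if ((unsigned)M / EltsPerLane != i / EltsPerLane)
      return false;                                 // crosses a 128-bit lane
  }
  return true;
}

int main() {
  std::printf("%d\n", (int)isSingleInputInLaneMask({3, 2, 1, 0, 7, 6, 5, 4}, 32)); // 1
  std::printf("%d\n", (int)isSingleInputInLaneMask({4, 5, 6, 7, 0, 1, 2, 3}, 32)); // 0
  return 0;
}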
if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || @@ -16888,9 +20286,12 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || + isCommutedMOVLMask(M, SVT) || isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || + isSHUFPMask(M, SVT, /* Commuted */ true) || isPSHUFDMask(M, SVT) || + isPSHUFDMask(M, SVT, /* SecondOperand */ true) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || isPALIGNRMask(M, SVT, Subtarget) || @@ -16898,7 +20299,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || + (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); } bool @@ -16908,6 +20310,14 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return false; MVT SVT = VT.getSimpleVT(); + + // This is an experimental legality test that is tailored to match the + // legality test of the experimental lowering more closely. They are gated + // separately to ease testing of performance differences. + if (ExperimentalVectorShuffleLegality) + // The new vector shuffle lowering is very good at managing zero-inputs. + return isShuffleMaskLegal(Mask, VT); + unsigned NumElts = SVT.getVectorNumElements(); // FIXME: This collection of masks seems suspect. if (NumElts == 2) @@ -16916,7 +20326,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, SVT) || isCommutedMOVLMask(Mask, SVT, true) || isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true)); + isSHUFPMask(Mask, SVT, /* Commuted */ true) || + isBlendMask(Mask, SVT, Subtarget->hasSSE41(), + Subtarget->hasInt256())); } return false; } @@ -17114,7 +20526,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -17266,7 +20678,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB - BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } @@ -17370,7 +20782,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -17380,7 +20792,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget->isTargetWin64()) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } @@ -17453,7 +20865,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -17479,7 +20891,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -17518,17 +20931,20 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, - bool Is64Bit) const { +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); - unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; - unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] @@ -17552,14 +20968,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), - physSPReg = Is64Bit ? X86::RSP : X86::ESP; + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = BB; ++MBBIter; @@ -17575,12 +20991,12 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + BuildMI(BB, DL, TII->get(IsLP64 ? 
X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. @@ -17588,12 +21004,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); - BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. - const uint32_t *RegMask = - MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); - if (Is64Bit) { + const uint32_t *RegMask = MF->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); + if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) @@ -17601,6 +21019,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); @@ -17616,8 +21042,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) - .addReg(Is64Bit ? X86::RAX : X86::EAX); - BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + .addReg(IsLP64 ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); @@ -17641,10 +21067,10 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetMacho()); + assert(!Subtarget->isTargetMachO()); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. @@ -17674,8 +21100,10 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, .addReg(X86::RAX); } } else { - const char *StackProbeSymbol = - Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca"; + const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || + Subtarget->isTargetWindowsItanium()) + ? "_chkstk" + : "_alloca"; BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol(StackProbeSymbol) @@ -17698,8 +21126,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // or EAX and doing an indirect call. The return value will then // be in the normal return register. 
MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII - = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo()); + const X86InstrInfo *TII = + static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); @@ -17708,8 +21136,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. - const uint32_t *RegMask = - F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = F->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -17754,7 +21184,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -17795,6 +21225,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // v = phi(main, restore) // // restoreMBB: + // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; @@ -17860,8 +21291,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -17878,8 +21309,20 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: + if (RegInfo->hasBasePointer(*MF)) { + const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>(); + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + X86FI->setRestoreBasePointer(MF); + unsigned FramePtr = RegInfo->getFrameRegister(*MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); + } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); - BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI->eraseFromParent(); @@ -17891,7 +21334,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -17906,8 +21349,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? 
&X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17951,7 +21394,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer -// to remove extra copies in the loop. +// to remove extra copies in the loop. MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -18006,6 +21449,11 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; + case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; + case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; + case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; + case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; + case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; @@ -18014,10 +21462,14 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; + case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; + case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; + case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; + case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; default: llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) @@ -18048,9 +21500,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::SEG_ALLOCA_32: - return EmitLoweredSegAlloca(MI, BB, false); case X86::SEG_ALLOCA_64: - return EmitLoweredSegAlloca(MI, BB, true); + return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -18083,7 +21534,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" @@ -18167,7 +21618,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features 
enabled"); - return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -18180,15 +21631,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), + Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -18204,6 +21656,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. + return emitPatchPoint(MI, BB); + case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); @@ -18224,6 +21681,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMSUBPSr213r: case X86::VFNMSUBSDr213r: case X86::VFNMSUBSSr213r: + case X86::VFMADDSUBPDr213r: + case X86::VFMADDSUBPSr213r: + case X86::VFMSUBADDPDr213r: + case X86::VFMSUBADDPSr213r: case X86::VFMADDPDr213rY: case X86::VFMADDPSr213rY: case X86::VFMSUBPDr213rY: @@ -18232,6 +21693,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMADDPSr213rY: case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPSr213rY: + case X86::VFMADDSUBPDr213rY: + case X86::VFMADDSUBPSr213rY: + case X86::VFMSUBADDPDr213rY: + case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); } } @@ -18461,6 +21926,329 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Combine an arbitrary chain of shuffles into a single instruction if +/// possible. +/// +/// This is the leaf of the recursive combinine below. When we have found some +/// chain of single-use x86 shuffle instructions and accumulated the combined +/// shuffle mask represented by them, this will try to pattern match that mask +/// into either a single instruction if there is a special purpose instruction +/// for this operation, or into a PSHUFB instruction which is a fully general +/// instruction but should only be used to replace chains over a certain depth. +static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + + // Find the operand that enters the chain. Note that multiple uses are OK + // here, we're not going to remove the operand we find. 
+ SDValue Input = Op.getOperand(0); + while (Input.getOpcode() == ISD::BITCAST) + Input = Input.getOperand(0); + + MVT VT = Input.getSimpleValueType(); + MVT RootVT = Root.getSimpleValueType(); + SDLoc DL(Root); + + // Just remove no-op shuffle masks. + if (Mask.size() == 1) { + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), + /*AddTo*/ true); + return true; + } + + // Use the float domain if the operand type is a floating point type. + bool FloatDomain = VT.isFloatingPoint(); + + // For floating point shuffles, we don't have free copies in the shuffle + // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. + // + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + if (FloatDomain) { + if (Mask.equals(0, 0) || Mask.equals(1, 1)) { + bool Lo = Mask.equals(0, 0); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. + if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Subtarget->hasSSE3() && + (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { + bool Lo = Mask.equals(0, 0, 2, 2); + unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { + bool Lo = Mask.equals(0, 0, 1, 1); + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + } + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. 
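// (For example, the v8i16 mask <0,0,1,1,2,2,3,3> is exactly a PUNPCKLWD of
// the input with itself and <4,4,5,5,6,6,7,7> is a PUNPCKHWD, while the two
// v16i8 masks checked below are the byte-wise PUNPCKLBW/PUNPCKHBW
// equivalents.)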
+ if (!FloatDomain && + (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || + Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, + 15))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. + if (Depth < 2) + return false; + + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. + if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + SmallVector<SDValue, 16> PSHUFBMask; + assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); + int Ratio = 16 / Mask.size(); + for (unsigned i = 0; i < 16; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; + PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); + } + Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + DCI.AddToWorklist(Op.getNode()); + SDValue PSHUFBMaskOp = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DCI.AddToWorklist(PSHUFBMaskOp.getNode()); + Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Failed to find any combines. + return false; +} + +/// \brief Fully generic combining of x86 shuffle instructions. +/// +/// This should be the last combine run over the x86 shuffle instructions. Once +/// they have been fully optimized, this will recursively consider all chains +/// of single-use shuffle instructions, build a generic model of the cumulative +/// shuffle operation, and check for simpler instructions which implement this +/// operation. We use this primarily for two purposes: +/// +/// 1) Collapse generic shuffles to specialized single instructions when +/// equivalent. In most cases, this is just an encoding size win, but +/// sometimes we will collapse multiple generic shuffles into a single +/// special-purpose shuffle. +/// 2) Look for sequences of shuffle instructions with 3 or more total +/// instructions, and replace them with the slightly more expensive SSSE3 +/// PSHUFB instruction if available. We do this as the last combining step +/// to ensure we avoid using PSHUFB if we can implement the shuffle with +/// a suitable short sequence of other instructions. 
The PHUFB will either +/// use a register or have to read from memory and so is slightly (but only +/// slightly) more expensive than the other shuffle instructions. +/// +/// Because this is inherently a quadratic operation (for each shuffle in +/// a chain, we recurse up the chain), the depth is limited to 8 instructions. +/// This should never be an issue in practice as the shuffle lowering doesn't +/// produce sequences of more than 8 instructions. +/// +/// FIXME: We will currently miss some cases where the redundant shuffling +/// would simplify under the threshold for PSHUFB formation because of +/// combine-ordering. To fix this, we should do the redundant instruction +/// combining in this recursive walk. +static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, + ArrayRef<int> RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // Bound the depth of our recursive combine because this is ultimately + // quadratic in nature. + if (Depth > 8) + return false; + + // Directly rip through bitcasts to find the underlying operand. + while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) + Op = Op.getOperand(0); + + MVT VT = Op.getSimpleValueType(); + if (!VT.isVector()) + return false; // Bail if we hit a non-vector. + // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit + // version should be added. + if (VT.getSizeInBits() != 128) + return false; + + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + "Can only combine shuffles of the same vector register size."); + + if (!isTargetShuffle(Op.getOpcode())) + return false; + SmallVector<int, 16> OpMask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + // We only can combine unary shuffles which we can decode the mask for. + if (!HaveMask || !IsUnary) + return false; + + assert(VT.getVectorNumElements() == OpMask.size() && + "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); + + SmallVector<int, 16> Mask; + Mask.reserve(std::max(OpMask.size(), RootMask.size())); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. 
+ Mask.push_back(RootMask[RootIdx]); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask.push_back(OpMask[OpIdx]); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); + } + + // See if we can recurse into the operand to combine more things. + switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + } + + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with squential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. + SmallVector<int, 16> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); + } + + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); +} + /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 @@ -18493,19 +22281,23 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. -static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); - // Walk up a single-use chain looking for a combinable shuffle. + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector<SDValue, 8> Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: - return false; // Nothing combined! + return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific @@ -18521,8 +22313,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the high words are self-contained. 
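// (Concretely, the dword shuffle's mask must be of the form <0,1,x,y> with x
// and y in {2,3}: it leaves the low 64 bits untouched and only permutes
// within the high 64 bits, so it cannot interact with the PSHUFLW and the
// walk up the chain can continue past it.)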
if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::PSHUFHW: @@ -18530,8 +22323,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::UNPCKL: @@ -18539,25 +22333,28 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) - return false; + return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) - return false; + return SDValue(); + Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: - return false; // Nothing to combine. + return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; + Chain.push_back(V); + // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); @@ -18573,10 +22370,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. - return false; - - // Record the old value to use in RAUW-ing. - SDValue Old = V; + return SDValue(); // Merge this node's mask and our incoming mask. SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); @@ -18585,20 +22379,34 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // It is possible that one of the combinable shuffles was completely absorbed - // by the other, just replace it and revisit all users in that case. - if (Old.getNode() == V.getNode()) { - DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); - return true; - } + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - return true; + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + break; + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + break; + } + } + if (V.getValueType() != N.getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); + + // Return the new chain to replace N. + return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. 
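As an illustrative aside, independent of the patch itself and using hypothetical helper names: the mask bookkeeping behind the shuffle combines above is plain index arithmetic. The sketch below shows (a) how two PSHUFD-style masks compose into one, matching the "merge this node's mask" step, and (b) how a combined 4 x i32 mask expands into the 16 x i8 control vector that a PSHUFB performing the same shuffle would use; unlike the real code, it ignores undef/zero sentinel entries.

#include <array>
#include <cstdio>

// Compose two 4-element shuffle masks: if Inner = shuffle(Src, InnerMask) and
// Outer = shuffle(Inner, OuterMask), then Outer == shuffle(Src, Combined)
// where Combined[i] = InnerMask[OuterMask[i]].
static std::array<int, 4> composeMasks(const std::array<int, 4> &InnerMask,
                                       const std::array<int, 4> &OuterMask) {
  std::array<int, 4> Combined;
  for (int i = 0; i < 4; ++i)
    Combined[i] = InnerMask[OuterMask[i]];
  return Combined;
}

// Expand a 4 x i32 shuffle mask into the 16 x i8 control a PSHUFB performing
// the same shuffle would use: byte i selects byte 4 * Mask[i / 4] + i % 4.
static std::array<int, 16> expandToByteMask(const std::array<int, 4> &Mask) {
  std::array<int, 16> Bytes;
  for (int i = 0; i < 16; ++i)
    Bytes[i] = 4 * Mask[i / 4] + i % 4;
  return Bytes;
}

int main() {
  // pshufd <1,0,3,2> followed by pshufd <2,3,0,1> is a single pshufd <3,2,1,0>.
  std::array<int, 4> Inner = {1, 0, 3, 2};
  std::array<int, 4> Outer = {2, 3, 0, 1};
  std::array<int, 4> C = composeMasks(Inner, Outer);
  std::printf("combined: <%d,%d,%d,%d>\n", C[0], C[1], C[2], C[3]);

  // The dword mask <2,3,0,1> becomes the byte mask <8..15, 0..7> for PSHUFB.
  for (int B : expandToByteMask(Outer))
    std::printf("%d ", B);
  std::printf("\n");
  return 0;
}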
@@ -18634,26 +22442,6 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // Other-half shuffles are no-ops. continue; - - case X86ISD::PSHUFD: { - // We can only handle pshufd if the half we are combining either stays in - // its half, or switches to the other half. Bail if one of these isn't - // true. - SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); - int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; - if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || - (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) - return false; - - // Map the mask through the pshufd and keep walking up the chain. - for (int i = 0; i < 4; ++i) - Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; - - // Switch halves if the pshufd does. - CombineOpcode = - VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; - continue; - } } // Break out of the loop if we break out of the switch. break; @@ -18663,7 +22451,11 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // We fell out of the loop without finding a viable combining instruction. return false; - // Record the old value to use in RAUW-ing. + // Combine away the bottom node as its shuffle will be accumulated into + // a preceding shuffle. + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all @@ -18674,12 +22466,13 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + // Check that the shuffles didn't cancel each other out. If not, we need to + // combine to the new one. + if (Old != V) + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } @@ -18720,13 +22513,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can - // combine with more operations. - if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && - areAdjacentMasksSequential(Mask)) { - int DMask[] = {-1, -1, -1, -1}; + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; - DMask[DOffset + 0] = DOffset + Mask[0] / 2; - DMask[DOffset + 1] = DOffset + Mask[2] / 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, @@ -18779,8 +22572,8 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) - return SDValue(); // We combined away this shuffle. 
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; break; } @@ -18788,6 +22581,61 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); } +/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// +/// We combine this directly on the abstract vector shuffle nodes so it is +/// easier to generically match. We also insert dummy vector shuffle nodes for +/// the operands which explicitly discard the lanes which are unused by this +/// operation to try to flow through the rest of the combiner the fact that +/// they're unused. +static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // We only handle target-independent shuffles. + // FIXME: It would be easy and harmless to use the target shuffle mask + // extraction tool to support more. + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *SVN = cast<ShuffleVectorSDNode>(N); + ArrayRef<int> Mask = SVN->getMask(); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + // We require the first shuffle operand to be the SUB node, and the second to + // be the ADD node. + // FIXME: We should support the commuted patterns. + if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + return SDValue(); + + // If there are other uses of these operations we can't fold them. + if (!V1->hasOneUse() || !V2->hasOneUse()) + return SDValue(); + + // Ensure that both operations have the same operands. Note that we can + // commute the FADD operands. + SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); + if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && + (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) + return SDValue(); + + // We're looking for blends between FADD and FSUB nodes. We insist on these + // nodes being lined up in a specific expected pattern. + if (!(isShuffleEquivalent(Mask, 0, 3) || + isShuffleEquivalent(Mask, 0, 5, 2, 7) || + isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) + return SDValue(); + + // Only specific types are legal at this point, assert so we notice if and + // when these change. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || + VT == MVT::v4f64) && + "Unknown vector type encountered!"); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -18797,54 +22645,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - // Canonicalize shuffles that perform 'addsub' on packed float vectors - // according to the rule: - // (shuffle (FADD A, B), (FSUB A, B), Mask) -> - // (shuffle (FSUB A, -B), (FADD A, -B), Mask) - // - // Where 'Mask' is: - // <0,5,2,7> -- for v4f32 and v4f64 shuffles; - // <0,3> -- for v2f64 shuffles; - // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles. - // - // This helps pattern-matching more SSE3/AVX ADDSUB instructions - // during ISel stage. - if (N->getOpcode() == ISD::VECTOR_SHUFFLE && - ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB && - // Operands to the FADD and FSUB must be the same. 
- ((N0->getOperand(0) == N1->getOperand(0) && - N0->getOperand(1) == N1->getOperand(1)) || - // FADD is commutable. See if by commuting the operands of the FADD - // we would still be able to match the operands of the FSUB dag node. - (N0->getOperand(1) == N1->getOperand(0) && - N0->getOperand(0) == N1->getOperand(1))) && - N0->getOperand(0)->getOpcode() != ISD::UNDEF && - N0->getOperand(1)->getOpcode() != ISD::UNDEF) { - - ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); - unsigned NumElts = VT.getVectorNumElements(); - ArrayRef<int> Mask = SV->getMask(); - bool CanFold = true; - - for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) - CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i); - - if (CanFold) { - SDValue Op0 = N1->getOperand(0); - SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1)); - SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1); - SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1); - return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask); - } - } - // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB node. + if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) + if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + return AddSub; + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) @@ -18869,7 +22680,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); - + if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { @@ -18921,6 +22732,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); if (Shuffle.getNode()) return Shuffle; + + // Try recursively combining arbitrary sequences of x86 shuffle + // instructions into higher-order shuffles. We do this after combining + // specific PSHUF instruction sequences into their minimal form so that we + // can evaluate how many specialized shuffle instructions are involved in + // a particular chain. + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. } return SDValue(); @@ -18938,7 +22761,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been customed lowered so we need to handle those here. +/// shuffles have been custom lowered so we need to handle those here. 
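/// (For example, extracting element 0 of a PSHUFD with mask <2,3,0,1> whose
/// operand is a v4i32 load picks element 2 of the load, i.e. the i32 at byte
/// offset 8 from the original pointer; the code below rebuilds a generic
/// VECTOR_SHUFFLE of the load so the DAG combiner can finish narrowing it to
/// that single scalar load.)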
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) @@ -18950,20 +22773,20 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!isa<ConstantSDNode>(EltNo)) return SDValue(); - EVT VT = InVec.getValueType(); + EVT OriginalVT = InVec.getValueType(); - bool HasShuffleIntoBitcast = false; if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); - HasShuffleIntoBitcast = true; } + EVT CurrentVT = InVec.getValueType(); + if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -18973,12 +22796,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, - UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); + unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) @@ -19004,35 +22827,37 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - if (HasShuffleIntoBitcast) { - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()-> - getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + EVT EltVT = N->getValueType(0); + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + EltVT.getTypeForEVT(*DAG.getContext())); - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return SDValue(); - } + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) + return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); - Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + SDValue Shuffle = (UnaryShuffle) ? 
DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); - Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts -/// to a simple store and scalar loads to extract the elements. +/// into a somewhat faster sequence. For i686, the best sequence is apparently +/// storing the value and loading scalars back, while for x64 we should +/// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); @@ -19091,36 +22916,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Ok, we've now decided to do the transformation. + // If 64-bit shifts are legal, use the extract-shift sequence, + // otherwise bounce the vector off the cache. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Vals[4]; SDLoc dl(InputVector); - // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, - MachinePointerInfo(), false, false, 0); + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { + SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(0, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(1, VecIdxTy)); + + SDValue ShAmt = DAG.getConstant(32, + DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); + Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); + Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); + } else { + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); - // Replace each use (extract) with a load of the appropriate element. - for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), - UE = Uses.end(); UI != UE; ++UI) { - SDNode *Extract = *UI; + EVT ElementType = InputVector.getValueType().getVectorElementType(); + unsigned EltSize = ElementType.getSizeInBits() / 8; - // cOMpute the element's address. - SDValue Idx = Extract->getOperand(1); - unsigned EltSize = - InputVector.getValueType().getVectorElementType().getSizeInBits()/8; - uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + // Replace each use (extract) with a load of the appropriate element. 
+ for (unsigned i = 0; i < 4; ++i) { + uint64_t Offset = EltSize * i; + SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), + StackPtr, OffsetVal); + + // Load the scalar. + Vals[i] = DAG.getLoad(ElementType, dl, Ch, + ScalarAddr, MachinePointerInfo(), + false, false, false, 0); + + } + } - // Load the scalar. - SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, - ScalarAddr, MachinePointerInfo(), - false, false, false, 0); + // Replace the extracts + for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; - // Replace the exact with the load. - DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); + SDValue Idx = Extract->getOperand(1); + uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. @@ -19137,6 +22987,21 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, bool NeedSplit = false; switch (VT.getSimpleVT().SimpleTy) { default: return std::make_pair(0, false); + case MVT::v4i64: + case MVT::v2i64: + if (!Subtarget->hasVLX()) + return std::make_pair(0, false); + break; + case MVT::v64i8: + case MVT::v32i16: + if (!Subtarget->hasBWI()) + return std::make_pair(0, false); + break; + case MVT::v16i32: + case MVT::v8i64: + if (!Subtarget->hasAVX512()) + return std::make_pair(0, false); + break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: @@ -19203,7 +23068,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, } static SDValue -TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, +transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue Cond = N->getOperand(0); @@ -19216,25 +23081,21 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, Cond = CondSrc->getOperand(0); } - MVT VT = N->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); - if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) return SDValue(); + MVT VT = N->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> ShuffleMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { // Be sure we emit undef where we can. 
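A brief aside on the immediate-to-mask conversion carried out when ShuffleMask is built here (a standalone sketch with a hypothetical helper name; the real code additionally emits undef mask entries for undef condition elements):

#include <cstdio>
#include <vector>

// Turn a BLENDI-style immediate into a VECTOR_SHUFFLE mask: a set bit i picks
// element i of the second operand (shuffle index i + NumElems), a clear bit
// keeps element i of the first operand (shuffle index i).
static std::vector<int> blendImmToShuffleMask(unsigned MaskValue,
                                              unsigned NumElems) {
  std::vector<int> Mask(NumElems);
  for (unsigned i = 0; i < NumElems; ++i)
    Mask[i] = i + NumElems * ((MaskValue >> i) & 1);
  return Mask;
}

int main() {
  // Immediate 0b0110 on a 4-element blend keeps LHS[0] and LHS[3] and takes
  // RHS[1] and RHS[2], i.e. the shuffle mask <0,5,6,3>.
  for (int M : blendImmToShuffleMask(0x6, 4))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}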
@@ -19244,6 +23105,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) + return SDValue(); return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); } @@ -19264,8 +23128,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). + // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && TLI.isTypeLegal(VT) && + VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -19408,13 +23273,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper - // lowering on AVX-512. In this case we convert it to + // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. - // The same situation for all 128 and 256-bit vectors of i8 and i16 + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || - OpVT.getVectorElementType() == MVT::i16)) { + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); @@ -19634,22 +23501,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opc, DL, VT, LHS, RHS); } - // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. - if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && - // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT && - // Check that condition value type matches vselect operand type - CondVT == VT) { - + // Simplify vector selection if condition value type matches vselect + // operand type + if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!TValIsAllOnes && !FValIsAllZeros) { - // Try invert the condition if true value is not all 1s and false value - // is not all 0s. + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. 
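The loop above maps an immediate blend mask onto a shuffle mask, one bit per lane, with a set bit selecting the lane from the second operand. A small stand-alone sketch of the same formula (hypothetical helper):

#include <vector>

static std::vector<int> blendMaskToShuffleMask(unsigned MaskValue,
                                               unsigned NumElems) {
  std::vector<int> Mask(NumElems);
  for (unsigned I = 0; I != NumElems; ++I)
    Mask[I] = I + NumElems * ((MaskValue >> I) & 1); // bit set -> take from RHS
  return Mask;
}

// Example: NumElems = 4, MaskValue = 0b0110 gives {0, 5, 6, 3}; lanes 1 and 2
// come from the second operand, the rest from the first.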
+ if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP* + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -19681,81 +23548,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to fold this VSELECT into a MOVSS/MOVSD - if (N->getOpcode() == ISD::VSELECT && - Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) { - if (VT == MVT::v4i32 || VT == MVT::v4f32 || - (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) { - bool CanFold = false; - unsigned NumElems = Cond.getNumOperands(); - SDValue A = LHS; - SDValue B = RHS; - - if (isZero(Cond.getOperand(0))) { - CanFold = true; - - // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B) - // fold (vselect <0,-1> -> (movsd A, B) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isAllOnes(Cond.getOperand(i)); - } else if (isAllOnes(Cond.getOperand(0))) { - CanFold = true; - std::swap(A, B); - - // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A) - // fold (vselect <-1,0> -> (movsd B, A) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isZero(Cond.getOperand(i)); - } - - if (CanFold) { - if (VT == MVT::v4i32 || VT == MVT::v4f32) - return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG); - return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG); - } - - if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) { - // fold (v4i32: vselect <0,0,-1,-1>, A, B) -> - // (v4i32 (bitcast (movsd (v2i64 (bitcast A)), - // (v2i64 (bitcast B))))) - // - // fold (v4f32: vselect <0,0,-1,-1>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast A)), - // (v2f64 (bitcast B))))) - // - // fold (v4i32: vselect <-1,-1,0,0>, A, B) -> - // (v4i32 (bitcast (movsd (v2i64 (bitcast B)), - // (v2i64 (bitcast A))))) - // - // fold (v4f32: vselect <-1,-1,0,0>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast B)), - // (v2f64 (bitcast A))))) - - CanFold = (isZero(Cond.getOperand(0)) && - isZero(Cond.getOperand(1)) && - isAllOnes(Cond.getOperand(2)) && - isAllOnes(Cond.getOperand(3))); - - if (!CanFold && isAllOnes(Cond.getOperand(0)) && - isAllOnes(Cond.getOperand(1)) && - isZero(Cond.getOperand(2)) && - isZero(Cond.getOperand(3))) { - CanFold = true; - std::swap(LHS, RHS); - } - - if (CanFold) { - EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64; - SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS); - SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS); - SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA, - NewB, DAG); - return DAG.getNode(ISD::BITCAST, DL, VT, Select); - } - } - } - } - // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits @@ -19767,22 +23559,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // build_vector of constants. This will be taken care in a later // condition. (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && - VT != MVT::v8i16)) { + VT != MVT::v8i16) && + // Don't optimize vector of constants. Those are handled by + // the generic code and all the bits must be properly set for + // the generic optimizer. 
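The inversion attempted above rests on the usual lane-wise select/mask identities: flipping the condition and swapping the arms turns the all-zeros-true-value and all-ones-false-value cases into forms that fold directly to AND/OR. A self-checking sketch for a single 32-bit lane:

#include <cassert>
#include <cstdint>

static void checkSelectIdentities(uint32_t C, uint32_t X) {
  assert(C == 0u || C == ~0u);   // one lane of a vector boolean: all zeros or all ones
  auto Sel = [&](uint32_t T, uint32_t F) { return C ? T : F; };
  assert(Sel(~0u, X) == (C | X));    // select(C, -1, x) == C | x
  assert(Sel(0u, X)  == (~C & X));   // select(C,  0, x) == ~C & x
  assert(Sel(X, 0u)  == (C & X));    // select(C, x,  0) == C & x
  assert(Sel(X, ~0u) == (~C | X));   // select(C, x, -1) == ~C | x
}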
+ !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are set - // properly. - for (SDNode::use_iterator I = Cond->use_begin(), - E = Cond->use_end(); I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. - return SDValue(); - assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -19790,8 +23577,45 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || - TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO)) { + // If we changed the computation somewhere in the DAG, this change + // will affect all users of Cond. + // Make sure it is fine and update all the nodes so that we do not + // use the generic VSELECT anymore. Otherwise, we may perform + // wrong optimizations as we messed up with the actual expectation + // for the vector boolean values. + if (Cond != TLO.Old) { + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are + // set properly. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + + // Update all the users of the condition, before committing the change, + // so that the VSELECT optimizations that expect the correct vector + // boolean value will not be triggered. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + DAG.ReplaceAllUsesOfValueWith( + SDValue(*I, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), + Cond, I->getOperand(1), I->getOperand(2))); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(); + } + // At this point, only Cond is changed. Change the condition + // just for N to keep the opportunity to optimize all other + // users their own way. + DAG.ReplaceAllUsesOfValueWith( + SDValue(N, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), + TLO.New, N->getOperand(1), N->getOperand(2))); + return SDValue(); + } } // We should generate an X86ISD::BLENDI from a vselect if its argument @@ -19805,8 +23629,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Iff we find this pattern and the build_vectors are built from // constants, we translate the vselect into a shuffle_vector that we // know will be matched by LowerVECTOR_SHUFFLEtoBlend. 
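The SimplifyDemandedBits call above asks only for the high bit of each condition lane because the variable blend instructions (BLENDVPS, PBLENDVB and friends) test nothing but the sign bit, which is what the new X86ISD::SHRUNKBLEND node records. Roughly, per 32-bit lane:

#include <cstdint>

// Only bit 31 of Mask is observed; everything below it is a "don't care",
// which is what lets the combine shrink the mask computation.
static int32_t blendLane(int32_t Mask, int32_t A, int32_t B) {
  return (Mask < 0) ? B : A;
}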
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { - SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; } @@ -20163,7 +23989,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, // fold (blend A, B, allOnes) -> B if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Op1; - + // Simplify the case where the mask is a constant i32 value. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { if (C->isNullValue()) @@ -20871,13 +24697,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); - // On Sandybridge unaligned 256bit loads are inefficient. + // For chips with slow 32-byte unaligned loads, break the 32-byte operation + // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) @@ -20907,153 +24733,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, NewVec, TF, true); } - // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the - // expansion is still better than scalar code. - // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll - // emit a shuffle and a arithmetic shift. - // TODO: It is possible to support ZExt by zeroing the undef values - // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && - (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) - return SDValue(); - - // All sizes must be a power of two. - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) - return SDValue(); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. - MVT SclrLoadTy = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. 
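The load combine above now keys off isUnalignedMem32Slow() rather than the absence of AVX2, splitting one unaligned 32-byte load into two 16-byte loads on chips such as Sandy Bridge. A byte-level model of the split (hypothetical helper):

#include <cstring>

static void loadYmmAsTwoXmm(const void *Src, unsigned char Dst[32]) {
  const unsigned char *P = static_cast<const unsigned char *>(Src);
  std::memcpy(Dst, P, 16);            // low 128 bits
  std::memcpy(Dst + 16, P + 16, 16);  // high 128 bits, then concatenated
}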
- unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - if (Ext == ISD::SEXTLOAD && NumLoads > 1) - return SDValue(); - - unsigned loadRegZize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz == 256) - loadRegZize /= 2; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, - loadRegZize/SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize/MemVT.getScalarType().getSizeInBits()); - - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); - - // We can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); - - SmallVector<SDValue, 8> Chains; - SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, - TLI.getPointerTy()); - SDValue Res = DAG.getUNDEF(LoadUnitVecVT); - - for (unsigned i = 0; i < NumLoads; ++i) { - // Perform a single load. - SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ptr, Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i)); - - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); - - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); - unsigned SizeRatio = RegSz/MemSz; - - if (Ext == ISD::SEXTLOAD) { - // If we have SSE4.1 we can directly emit a VSEXT node. - if (Subtarget->hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); - return DCI.CombineTo(N, Sext, TF, true); - } - - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. - if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) - return SDValue(); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio + SizeRatio-1] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); - - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, RegVT)); - - return DCI.CombineTo(N, Shuff, TF, true); - } - - // Redistribute the loaded elements into the different locations. 
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); - - // Bitcast to the requested type. - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); - // Replace the original load with the new sequence - // and return the new chain. - return DCI.CombineTo(N, Shuff, TF, true); - } - return SDValue(); } @@ -21067,13 +24746,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatenation of two XMM registers, perform two stores. - // On Sandy Bridge, 256-bit memory operations are executed by two - // 128-bit ports. However, on Haswell it is better to issue a single 256-bit - // memory operation. + // If we are saving a concatenation of two XMM registers and 32-byte stores + // are slow, such as on Sandy Bridge, perform two 16-byte stores. unsigned Alignment = St->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && !Subtarget->hasInt256() && + if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) @@ -21139,9 +24816,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Find the largest store unit MVT StoreType = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; + for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } @@ -21287,7 +24962,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" +/// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, @@ -21413,7 +25088,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { return true; } -/// PerformFADDCombine - Do target-specific dag combines on floating point adds. +/// Do target-specific dag combines on floating point adds. static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -21428,7 +25103,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. +/// Do target-specific dag combines on floating point subs. static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -21443,8 +25118,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and -/// X86ISD::FXOR nodes. +/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 
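The store splitting above mirrors the load case, and the range-for over MVT::integer_valuetypes() keeps the old behaviour of picking the widest legal integer store unit that still fits. A simplified stand-in with a hypothetical legality predicate:

#include <initializer_list>

static unsigned pickStoreUnitBits(unsigned TotalBits,
                                  bool (*IsLegal)(unsigned Bits)) {
  unsigned Best = 8;                       // i8 always works as a fallback
  for (unsigned Bits : {8u, 16u, 32u, 64u, 128u})
    if (IsLegal(Bits) && Bits <= TotalBits)
      Best = Bits;                         // later, wider candidates win
  return Best;
}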
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x @@ -21458,8 +25132,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and -/// X86ISD::FMAX nodes. +/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); @@ -21480,7 +25153,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } -/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. +/// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 // FAND(x, 0.0) -> 0.0 @@ -21493,7 +25166,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +/// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { // FANDN(x, 0.0) -> 0.0 // FANDN(0.0, x) -> x @@ -21576,13 +25249,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + N0.getValueType() == MVT::i8 && VT == MVT::i32) { + SDLoc dl(N); + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) @@ -21675,6 +25364,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return R; } + // (i8,i32 zext (udivrem (i8 x, i8 y)) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + (VT == MVT::i32 || VT == MVT::i64)) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + return SDValue(); } @@ -21863,14 +25566,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); - // Now check that the other operand of the AND is a constant splat. 
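The new SDIVREM8_SEXT_HREG / UDIVREM8_ZEXT_HREG combines target an i8 divrem whose remainder is immediately extended to i32 or i64; since the DIV/IDIV r/m8 forms leave the remainder in AH, exposing the extension to the divrem lowering lets it be read straight out of AH instead of going through extra register shuffling. Source of roughly this shape can produce the pattern once the division has been narrowed to 8 bits (illustrative only):

#include <cstdint>

static int32_t signedRem(int8_t X, int8_t Y) {
  return X % Y;    // conceptually: 8-bit remainder, sign-extended to i32
}

static uint32_t unsignedRem(uint8_t X, uint8_t Y) {
  return X % Y;    // conceptually: 8-bit remainder, zero-extended to i32
}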
We could + // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { - // Bail out if the vector isn't a constant splat. - if (!BV->getConstantSplatNode()) + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. @@ -22044,18 +25747,68 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + // (vzext (bitcast (vzext (x)) -> (vzext x) - SDValue In = N->getOperand(0); - while (In.getOpcode() == ISD::BITCAST) - In = In.getOperand(0); + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - if (In.getOpcode() != X86ISD::VZEXT) - return SDValue(); + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); - return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), - In.getOperand(0)); + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. + if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, + DAG.getNode(ISD::BITCAST, DL, OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. Essentialy: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1))) + if (ExtractIdx->getZExtValue() == 0) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... 
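The (vzext (bitcast (vzext x))) -> (vzext x) fold above is the vector form of the standard rule that back-to-back zero-extensions collapse into one, because zero-extension only ever adds zero bits above the original value. A scalar illustration:

#include <cstdint>

static uint32_t zextTwice(uint8_t X) {
  return static_cast<uint32_t>(static_cast<uint16_t>(X));
}
static uint32_t zextOnce(uint8_t X) {
  return static_cast<uint32_t>(X);
}
// zextTwice(X) == zextOnce(X) for every X, which is what justifies folding the
// nested VZEXT (when the element sizes line up) into a single wider one.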
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0)); + } + Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } + + return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -22066,7 +25819,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::SELECT: + case X86ISD::SHRUNKBLEND: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -22107,12 +25862,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); @@ -22545,6 +26301,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'L': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || + (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType()); + break; + } + } + return; + case 'M': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 3) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'N': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { @@ -22553,6 +26326,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'O': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 127) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -22762,14 +26543,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[5] == ')' && Constraint[6] == '}') { - Res.first = X86::ST0+Constraint[4]-'0'; + Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { - Res.first = X86::ST0; + Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } @@ -22897,7 +26678,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. - // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. 
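The new 'L', 'M' and 'O' cases above extend the x86 inline-asm immediate constraints ('L' accepts 0xff/0xffff/0xffffffff, 'M' accepts 0 through 3, 'O' accepts 0 through 127, alongside the existing 'N' for unsigned 8-bit constants). A typical use of 'N' is pinning an I/O port number into the immediate form of OUT (hypothetical wrapper, x86-only):

#include <cstdint>

static inline void outbToPort0x80(uint8_t Val) {
  asm volatile("outb %0, %1" : : "a"(Val), "N"(0x80)); // port must fit in imm8
}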
if (isLegalAddressingMode(AM, Ty))
  // Scale represents reg2 * scale, thus account for 1
  // as soon as we use a second register.