Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 12158
1 file changed, 7562 insertions, 4596 deletions
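The change below is largely mechanical: the constructor's long runs of per-type setOperationAction() calls are folded into loops over MVT initializer lists, Promote/AddPromotedToType pairs are replaced by setOperationPromotedToType(), and the Subtarget member is accessed as a reference instead of a pointer. As a reading aid only, here is a minimal self-contained sketch of that action-table population pattern. The enums and helper functions are toy stand-ins, not the real TargetLowering API, and this snippet is not part of the diff.

// Illustrative sketch -- NOT part of the diff. It mocks up a
// TargetLowering-style "operation action" table so the refactoring pattern
// in the change below is easy to see.
#include <cstdio>
#include <map>
#include <utility>

enum class Op { SELECT, SETCC, CTTZ };          // toy stand-ins for ISD opcodes
enum class VT { i8, i16, i32, i64 };            // toy stand-ins for MVT types
enum class Action { Legal, Promote, Expand, Custom };

static std::map<std::pair<Op, VT>, Action> ActionTable;
static std::map<std::pair<Op, VT>, VT> PromoteTable;

static void setOperationAction(Op O, VT T, Action A) { ActionTable[{O, T}] = A; }
static void AddPromotedToType(Op O, VT T, VT DestT) { PromoteTable[{O, T}] = DestT; }

// Analogue of the helper the new code uses: mark the op Promote and record
// the destination type in a single call.
static void setOperationPromotedToType(Op O, VT T, VT DestT) {
  setOperationAction(O, T, Action::Promote);
  AddPromotedToType(O, T, DestT);
}

int main() {
  // Old style: one call per type, repeated for every opcode.
  setOperationAction(Op::SELECT, VT::i8,  Action::Custom);
  setOperationAction(Op::SELECT, VT::i16, Action::Custom);
  setOperationAction(Op::SETCC,  VT::i8,  Action::Custom);
  setOperationAction(Op::SETCC,  VT::i16, Action::Custom);

  // New style from the diff: iterate an initializer list of types.
  for (auto T : {VT::i8, VT::i16, VT::i32, VT::i64}) {
    setOperationAction(Op::SELECT, T, Action::Custom);
    setOperationAction(Op::SETCC,  T, Action::Custom);
  }

  // New style for promotions: one call replaces a Promote/AddPromotedToType pair.
  setOperationPromotedToType(Op::CTTZ, VT::i8, VT::i32);

  std::printf("%zu actions, %zu promotions recorded\n",
              ActionTable.size(), PromoteTable.size());
  return 0;
}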
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index dd9966f9e1791..e547111959008 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -71,9 +71,10 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { - X86ScalarSSEf64 = Subtarget->hasSSE2(); - X86ScalarSSEf32 = Subtarget->hasSSE1(); + : TargetLowering(TM), Subtarget(STI) { + bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); + X86ScalarSSEf64 = Subtarget.hasSSE2(); + X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. @@ -86,24 +87,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. - if (Subtarget->isAtom()) + if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); - else if (Subtarget->is64Bit()) + else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2. if (TM.getOptLevel() >= CodeGenOpt::Default) { - if (Subtarget->hasSlowDivide32()) + if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); - if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) + if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 16); } - if (Subtarget->isTargetKnownWindowsMSVC()) { + if (Subtarget.isTargetKnownWindowsMSVC()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); @@ -117,11 +118,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } - if (Subtarget->isTargetDarwin()) { + if (Subtarget.isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); - } else if (Subtarget->isTargetWindowsGNU()) { + } else if (Subtarget.isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); @@ -134,7 +135,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); - if (Subtarget->is64Bit()) + if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) @@ -164,14 +165,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (Subtarget->is64Bit()) { - if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + if (Subtarget.is64Bit()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) // f32/f64 are legal, f80 is custom. 
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } else if (!Subtarget->useSoftFloat()) { + } else if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -185,8 +186,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!Subtarget->useSoftFloat()) { - // SSE has no i16 to fp conversion, only i32 + if (!Subtarget.useSoftFloat()) { + // SSE has no i16 to fp conversion, only i32. if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not @@ -205,7 +206,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - if (!Subtarget->useSoftFloat()) { + if (!Subtarget.useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); @@ -231,8 +232,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - if (Subtarget->is64Bit()) { - if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + if (Subtarget.is64Bit()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); @@ -240,9 +241,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } - } else if (!Subtarget->useSoftFloat()) { + } else if (!Subtarget.useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. - if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) + if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. @@ -260,12 +261,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. 
setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } else if (!Subtarget->is64Bit()) + } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that @@ -295,72 +296,43 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); - setOperationAction(ISD::BR_CC , MVT::f32, Expand); - setOperationAction(ISD::BR_CC , MVT::f64, Expand); - setOperationAction(ISD::BR_CC , MVT::f80, Expand); - setOperationAction(ISD::BR_CC , MVT::f128, Expand); - setOperationAction(ISD::BR_CC , MVT::i8, Expand); - setOperationAction(ISD::BR_CC , MVT::i16, Expand); - setOperationAction(ISD::BR_CC , MVT::i32, Expand); - setOperationAction(ISD::BR_CC , MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); - setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); - setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); - setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); - setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); - if (Subtarget->is64Bit()) + for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, + MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + setOperationAction(ISD::BR_CC, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + } + if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { - // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` - // is. We should promote the value to 64-bits to solve this. - // This is what the CRT headers do - `fmodf` is an inline header - // function casting to f64 and calling `fmod`. - setOperationAction(ISD::FREM , MVT::f32 , Promote); - } else { - setOperationAction(ISD::FREM , MVT::f32 , Expand); - } - + setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. 
- setOperationAction(ISD::CTTZ , MVT::i8 , Promote); - AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); - setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); - AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); - if (Subtarget->hasBMI()) { - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); - if (Subtarget->is64Bit()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - } else { + setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); - if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); + if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); + } } - if (Subtarget->hasLZCNT()) { + if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. - setOperationAction(ISD::CTLZ , MVT::i8 , Promote); - AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); - AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); - if (Subtarget->is64Bit()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); @@ -368,7 +340,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } @@ -377,7 +349,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. 
- if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { + if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } @@ -395,45 +367,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); - if (Subtarget->hasPOPCNT()) { + if (Subtarget.hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); - if (Subtarget->is64Bit()) + if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); - if (!Subtarget->hasMOVBE()) + if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. - setOperationAction(ISD::SELECT , MVT::i8 , Custom); - setOperationAction(ISD::SELECT , MVT::i16 , Custom); - setOperationAction(ISD::SELECT , MVT::i32 , Custom); - setOperationAction(ISD::SELECT , MVT::f32 , Custom); - setOperationAction(ISD::SELECT , MVT::f64 , Custom); - setOperationAction(ISD::SELECT , MVT::f80 , Custom); - setOperationAction(ISD::SELECT , MVT::f128 , Custom); - setOperationAction(ISD::SETCC , MVT::i8 , Custom); - setOperationAction(ISD::SETCC , MVT::i16 , Custom); - setOperationAction(ISD::SETCC , MVT::i32 , Custom); - setOperationAction(ISD::SETCC , MVT::f32 , Custom); - setOperationAction(ISD::SETCC , MVT::f64 , Custom); - setOperationAction(ISD::SETCC , MVT::f80 , Custom); - setOperationAction(ISD::SETCC , MVT::f128 , Custom); - setOperationAction(ISD::SETCCE , MVT::i8 , Custom); - setOperationAction(ISD::SETCCE , MVT::i16 , Custom); - setOperationAction(ISD::SETCCE , MVT::i32 , Custom); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::SELECT , MVT::i64 , Custom); - setOperationAction(ISD::SETCC , MVT::i64 , Custom); - setOperationAction(ISD::SETCCE , MVT::i64 , Custom); + for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + } + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SETCCE, VT, Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -444,34 +405,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) + setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. 
- setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); - setOperationAction(ISD::JumpTable , MVT::i32 , Custom); - setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); - setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); - if (Subtarget->is64Bit()) - setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); - setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); - setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); - setOperationAction(ISD::JumpTable , MVT::i64 , Custom); - setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); - setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); - setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); + for (auto VT : { MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; + setOperationAction(ISD::ConstantPool , VT, Custom); + setOperationAction(ISD::JumpTable , VT, Custom); + setOperationAction(ISD::GlobalAddress , VT, Custom); + setOperationAction(ISD::GlobalTLSAddress, VT, Custom); + setOperationAction(ISD::ExternalSymbol , VT, Custom); + setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) - setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); - setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); - setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); - setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); - setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); + for (auto VT : { MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; + setOperationAction(ISD::SHL_PARTS, VT, Custom); + setOperationAction(ISD::SRA_PARTS, VT, Custom); + setOperationAction(ISD::SRL_PARTS, VT, Custom); } - if (Subtarget->hasSSE1()) + if (Subtarget.hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -480,16 +438,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } - if (Subtarget->hasCmpxchg16b()) { + if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags - if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && - !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { + if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && + !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && + TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } @@ -505,14 +468,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::VAARG , MVT::Other, Custom); - 
setOperationAction(ISD::VACOPY , MVT::Other, Custom); - } else { - // TargetInfo::CharPtrBuiltinVaList - setOperationAction(ISD::VAARG , MVT::Other, Expand); - setOperationAction(ISD::VACOPY , MVT::Other, Expand); - } + bool Is64Bit = Subtarget.is64Bit(); + setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); @@ -523,41 +481,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); - if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { + if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::FR64RegClass); - // Use ANDPD to simulate FABS. - setOperationAction(ISD::FABS , MVT::f64, Custom); - setOperationAction(ISD::FABS , MVT::f32, Custom); + for (auto VT : { MVT::f32, MVT::f64 }) { + // Use ANDPD to simulate FABS. + setOperationAction(ISD::FABS, VT, Custom); - // Use XORP to simulate FNEG. - setOperationAction(ISD::FNEG , MVT::f64, Custom); - setOperationAction(ISD::FNEG , MVT::f32, Custom); + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG, VT, Custom); - // Use ANDPD and ORPD to simulate FCOPYSIGN. - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + // Use ANDPD and ORPD to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , VT, Expand); + setOperationAction(ISD::FCOS , VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + } - // Lower this to FGETSIGNx86 plus an AND. + // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); - // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { + } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -592,24 +546,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } - } else if (!Subtarget->useSoftFloat()) { + } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. 
addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); - setOperationAction(ISD::UNDEF, MVT::f64, Expand); - setOperationAction(ISD::UNDEF, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + for (auto VT : { MVT::f32, MVT::f64 }) { + setOperationAction(ISD::UNDEF, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); - if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , VT, Expand); + setOperationAction(ISD::FCOS , VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + } } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 @@ -626,8 +577,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. - if (!Subtarget->useSoftFloat()) { - if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + if (UseX87) { + if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); @@ -680,38 +631,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); + // Some FP actions are always expanded for vector types. + for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, + MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + } + // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. 
for (MVT VT : MVT::vector_valuetypes()) { - setOperationAction(ISD::ADD , VT, Expand); - setOperationAction(ISD::SUB , VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::MUL , VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::LOAD, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -723,24 +672,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); @@ -750,7 +688,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); @@ -774,35 +711,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. 
} - // MMX-sized vectors (other than x86mmx) are expected to be expanded - // into smaller operations. - for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { - setOperationAction(ISD::MULHS, MMXTy, Expand); - setOperationAction(ISD::AND, MMXTy, Expand); - setOperationAction(ISD::OR, MMXTy, Expand); - setOperationAction(ISD::XOR, MMXTy, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); - setOperationAction(ISD::SELECT, MMXTy, Expand); - setOperationAction(ISD::BITCAST, MMXTy, Expand); - } - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); - - if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); - setOperationAction(ISD::FADD, MVT::v4f32, Legal); - setOperationAction(ISD::FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); - setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); @@ -811,7 +729,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } - if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM @@ -821,27 +739,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v4i32, &X86::VR128RegClass); addRegisterClass(MVT::v2i64, &X86::VR128RegClass); - setOperationAction(ISD::ADD, MVT::v16i8, Legal); - setOperationAction(ISD::ADD, MVT::v8i16, Legal); - setOperationAction(ISD::ADD, MVT::v4i32, Legal); - setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i8, Custom); + setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); - setOperationAction(ISD::SUB, MVT::v16i8, Legal); - setOperationAction(ISD::SUB, MVT::v8i16, Legal); - setOperationAction(ISD::SUB, MVT::v4i32, Legal); - setOperationAction(ISD::SUB, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); - setOperationAction(ISD::FADD, MVT::v2f64, Legal); - setOperationAction(ISD::FSUB, MVT::v2f64, Legal); - setOperationAction(ISD::FMUL, MVT::v2f64, Legal); - setOperationAction(ISD::FDIV, MVT::v2f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); @@ -870,10 +777,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); // ISD::CTTZ v2i64 - scalarization is faster. 
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); - // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { @@ -899,37 +802,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); - setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); - setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + for (auto VT : { MVT::v2f64, MVT::v2i64 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + + if (VT == MVT::v2i64 && !Subtarget.is64Bit()) + continue; - if (Subtarget->is64Bit()) { - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { - setOperationAction(ISD::AND, VT, Promote); - AddPromotedToType (ISD::AND, VT, MVT::v2i64); - setOperationAction(ISD::OR, VT, Promote); - AddPromotedToType (ISD::OR, VT, MVT::v2i64); - setOperationAction(ISD::XOR, VT, Promote); - AddPromotedToType (ISD::XOR, VT, MVT::v2i64); - setOperationAction(ISD::LOAD, VT, Promote); - AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); - setOperationAction(ISD::SELECT, VT, Promote); - AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); + setOperationPromotedToType(ISD::AND, VT, MVT::v2i64); + setOperationPromotedToType(ISD::OR, VT, MVT::v2i64); + setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64); + setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64); + setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. - setOperationAction(ISD::LOAD, MVT::v2f64, Legal); - setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); @@ -942,7 +836,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom // sequence to convert from v2i32 to v2f32. 
- if (!Subtarget->is64Bit()) + if (!Subtarget.is64Bit()) setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); @@ -954,9 +848,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + + for (auto VT : { MVT::v8i16, MVT::v16i8 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + } + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + for (auto VT : { MVT::v4i32, MVT::v2i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + } + } + + if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { + setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + // ISD::CTLZ v4i32 - scalarization is faster. + // ISD::CTLZ v2i64 - scalarization is faster. } - if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); @@ -1004,66 +924,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); - // i8 and i16 vectors are custom because the source register and source - // source memory operand types are not the same width. f32 vectors are - // custom since the immediate controlling the insert encodes additional - // information. + // i8 vectors are custom because the source register and source + // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - - // FIXME: these should be Legal, but that's only for the case where - // the index is constant. For now custom expand to deal with that. 
- if (Subtarget->is64Bit()) { - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); - } } - if (Subtarget->hasSSE2()) { - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - - setOperationAction(ISD::SRL, MVT::v8i16, Custom); - setOperationAction(ISD::SRL, MVT::v16i8, Custom); + if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, + MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::SHL, MVT::v8i16, Custom); - setOperationAction(ISD::SHL, MVT::v16i8, Custom); - - setOperationAction(ISD::SRA, MVT::v8i16, Custom); - setOperationAction(ISD::SRA, MVT::v16i8, Custom); - - // In the customized shift lowering, the legal cases in AVX2 will be - // recognized. - setOperationAction(ISD::SRL, MVT::v2i64, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Custom); + // XOP can efficiently perform BITREVERSE with VPPERM. + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) + setOperationAction(ISD::BITREVERSE, VT, Custom); - setOperationAction(ISD::SHL, MVT::v2i64, Custom); - setOperationAction(ISD::SHL, MVT::v4i32, Custom); - - setOperationAction(ISD::SRA, MVT::v2i64, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Custom); + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, + MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::BITREVERSE, VT, Custom); } - if (Subtarget->hasXOP()) { - setOperationAction(ISD::ROTL, MVT::v16i8, Custom); - setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - setOperationAction(ISD::ROTL, MVT::v4i32, Custom); - setOperationAction(ISD::ROTL, MVT::v2i64, Custom); - setOperationAction(ISD::ROTL, MVT::v32i8, Custom); - setOperationAction(ISD::ROTL, MVT::v16i16, Custom); - setOperationAction(ISD::ROTL, MVT::v8i32, Custom); - setOperationAction(ISD::ROTL, MVT::v4i64, Custom); - } + if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) { + bool HasInt256 = Subtarget.hasInt256(); - if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); @@ -1071,35 +953,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v4i64, &X86::VR256RegClass); addRegisterClass(MVT::v4f64, &X86::VR256RegClass); - setOperationAction(ISD::LOAD, MVT::v8f32, Legal); - setOperationAction(ISD::LOAD, MVT::v4f64, Legal); - setOperationAction(ISD::LOAD, MVT::v4i64, Legal); - - setOperationAction(ISD::FADD, MVT::v8f32, Legal); - setOperationAction(ISD::FSUB, MVT::v8f32, Legal); - setOperationAction(ISD::FMUL, MVT::v8f32, Legal); - setOperationAction(ISD::FDIV, MVT::v8f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); - setOperationAction(ISD::FRINT, MVT::v8f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); - setOperationAction(ISD::FNEG, MVT::v8f32, Custom); - setOperationAction(ISD::FABS, MVT::v8f32, Custom); - - setOperationAction(ISD::FADD, MVT::v4f64, Legal); - setOperationAction(ISD::FSUB, 
MVT::v4f64, Legal); - setOperationAction(ISD::FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); - setOperationAction(ISD::FRINT, MVT::v4f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); - setOperationAction(ISD::FNEG, MVT::v4f64, Custom); - setOperationAction(ISD::FABS, MVT::v4f64, Custom); + for (auto VT : { MVT::v8f32, MVT::v4f64 }) { + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. @@ -1117,14 +979,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); - setOperationAction(ISD::SRL, MVT::v16i16, Custom); - setOperationAction(ISD::SRL, MVT::v32i8, Custom); - - setOperationAction(ISD::SHL, MVT::v16i16, Custom); - setOperationAction(ISD::SHL, MVT::v32i8, Custom); - - setOperationAction(ISD::SRA, MVT::v16i16, Custom); - setOperationAction(ISD::SRA, MVT::v32i8, Custom); + for (auto VT : { MVT::v32i8, MVT::v16i16 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + } setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); @@ -1147,63 +1006,57 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); + + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + } + + // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2 + // as we end up splitting the 256-bit vectors. 
+ for (auto VT : { MVT::v32i8, MVT::v16i16 }) + setOperationAction(ISD::CTLZ, VT, Custom); + + if (HasInt256) + for (auto VT : { MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); - setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); - setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); - setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - - setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); - setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); - setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); - setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - - if (Subtarget->hasAnyFMA()) { - setOperationAction(ISD::FMA, MVT::v8f32, Legal); - setOperationAction(ISD::FMA, MVT::v4f64, Legal); - setOperationAction(ISD::FMA, MVT::v4f32, Legal); - setOperationAction(ISD::FMA, MVT::v2f64, Legal); - setOperationAction(ISD::FMA, MVT::f32, Legal); - setOperationAction(ISD::FMA, MVT::f64, Legal); - } - - if (Subtarget->hasInt256()) { - setOperationAction(ISD::ADD, MVT::v4i64, Legal); - setOperationAction(ISD::ADD, MVT::v8i32, Legal); - setOperationAction(ISD::ADD, MVT::v16i16, Legal); - setOperationAction(ISD::ADD, MVT::v32i8, Legal); - - setOperationAction(ISD::SUB, MVT::v4i64, Legal); - setOperationAction(ISD::SUB, MVT::v8i32, Legal); - setOperationAction(ISD::SUB, MVT::v16i16, Legal); - setOperationAction(ISD::SUB, MVT::v32i8, Legal); - - setOperationAction(ISD::MUL, MVT::v4i64, Custom); - setOperationAction(ISD::MUL, MVT::v8i32, Legal); - setOperationAction(ISD::MUL, MVT::v16i16, Legal); - setOperationAction(ISD::MUL, MVT::v32i8, Custom); - - setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); - setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); - setOperationAction(ISD::MULHU, MVT::v16i16, Legal); - setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - - setOperationAction(ISD::SMAX, MVT::v32i8, Legal); - setOperationAction(ISD::SMAX, MVT::v16i16, Legal); - setOperationAction(ISD::SMAX, MVT::v8i32, Legal); - setOperationAction(ISD::UMAX, MVT::v32i8, Legal); - setOperationAction(ISD::UMAX, MVT::v16i16, Legal); - setOperationAction(ISD::UMAX, MVT::v8i32, Legal); - setOperationAction(ISD::SMIN, MVT::v32i8, Legal); - setOperationAction(ISD::SMIN, MVT::v16i16, Legal); - setOperationAction(ISD::SMIN, MVT::v8i32, Legal); - setOperationAction(ISD::UMIN, MVT::v32i8, Legal); - setOperationAction(ISD::UMIN, MVT::v16i16, Legal); - setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + if (Subtarget.hasAnyFMA()) { + for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, + MVT::v2f64, MVT::v4f64 }) + setOperationAction(ISD::FMA, VT, Legal); + } + + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); + } + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + + setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? 
Legal : Custom); + setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MULHU, MVT::v32i8, Custom); + setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); + setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); + setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); + } + + if (HasInt256) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. @@ -1223,62 +1076,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); - } else { - setOperationAction(ISD::ADD, MVT::v4i64, Custom); - setOperationAction(ISD::ADD, MVT::v8i32, Custom); - setOperationAction(ISD::ADD, MVT::v16i16, Custom); - setOperationAction(ISD::ADD, MVT::v32i8, Custom); - - setOperationAction(ISD::SUB, MVT::v4i64, Custom); - setOperationAction(ISD::SUB, MVT::v8i32, Custom); - setOperationAction(ISD::SUB, MVT::v16i16, Custom); - setOperationAction(ISD::SUB, MVT::v32i8, Custom); - - setOperationAction(ISD::MUL, MVT::v4i64, Custom); - setOperationAction(ISD::MUL, MVT::v8i32, Custom); - setOperationAction(ISD::MUL, MVT::v16i16, Custom); - setOperationAction(ISD::MUL, MVT::v32i8, Custom); - - setOperationAction(ISD::SMAX, MVT::v32i8, Custom); - setOperationAction(ISD::SMAX, MVT::v16i16, Custom); - setOperationAction(ISD::SMAX, MVT::v8i32, Custom); - setOperationAction(ISD::UMAX, MVT::v32i8, Custom); - setOperationAction(ISD::UMAX, MVT::v16i16, Custom); - setOperationAction(ISD::UMAX, MVT::v8i32, Custom); - setOperationAction(ISD::SMIN, MVT::v32i8, Custom); - setOperationAction(ISD::SMIN, MVT::v16i16, Custom); - setOperationAction(ISD::SMIN, MVT::v8i32, Custom); - setOperationAction(ISD::UMIN, MVT::v32i8, Custom); - setOperationAction(ISD::UMIN, MVT::v16i16, Custom); - setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be // recognized. - setOperationAction(ISD::SRL, MVT::v4i64, Custom); - setOperationAction(ISD::SRL, MVT::v8i32, Custom); + for (auto VT : { MVT::v8i32, MVT::v4i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + } - setOperationAction(ISD::SHL, MVT::v4i64, Custom); - setOperationAction(ISD::SHL, MVT::v8i32, Custom); + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } - setOperationAction(ISD::SRA, MVT::v4i64, Custom); - setOperationAction(ISD::SRA, MVT::v8i32, Custom); + // Extract subvector is special because the value type + // (result) is 128-bit but the source is 256-bit wide. 
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, + MVT::v4f32, MVT::v2f64 }) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } // Custom lower several nodes for 256-bit types. - for (MVT VT : MVT::vector_valuetypes()) { - if (VT.getScalarSizeInBits() >= 32) { - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); - } - // Extract subvector is special because the value type - // (result) is 128-bit but the source is 256-bit wide. - if (VT.is128BitVector()) { - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - } - // Do not attempt to custom lower other non-256-bit vectors - if (!VT.is256BitVector()) - continue; - + for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, + MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1289,25 +1112,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } - if (Subtarget->hasInt256()) + if (HasInt256) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { - setOperationAction(ISD::AND, VT, Promote); - AddPromotedToType (ISD::AND, VT, MVT::v4i64); - setOperationAction(ISD::OR, VT, Promote); - AddPromotedToType (ISD::OR, VT, MVT::v4i64); - setOperationAction(ISD::XOR, VT, Promote); - AddPromotedToType (ISD::XOR, VT, MVT::v4i64); - setOperationAction(ISD::LOAD, VT, Promote); - AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); - setOperationAction(ISD::SELECT, VT, Promote); - AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); + setOperationPromotedToType(ISD::AND, VT, MVT::v4i64); + setOperationPromotedToType(ISD::OR, VT, MVT::v4i64); + setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64); + setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); + setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64); } } - if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); @@ -1320,19 +1138,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); - + for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ExtType, 
MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); + } setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::SETCCE, MVT::i1, Custom); @@ -1343,29 +1156,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, MVT::i1, Custom); setOperationAction(ISD::ADD, MVT::i1, Custom); setOperationAction(ISD::MUL, MVT::i1, Custom); - setOperationAction(ISD::LOAD, MVT::v16f32, Legal); - setOperationAction(ISD::LOAD, MVT::v8f64, Legal); - setOperationAction(ISD::LOAD, MVT::v8i64, Legal); - setOperationAction(ISD::LOAD, MVT::v16i32, Legal); - setOperationAction(ISD::LOAD, MVT::v16i1, Legal); - - setOperationAction(ISD::FADD, MVT::v16f32, Legal); - setOperationAction(ISD::FSUB, MVT::v16f32, Legal); - setOperationAction(ISD::FMUL, MVT::v16f32, Legal); - setOperationAction(ISD::FDIV, MVT::v16f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); - setOperationAction(ISD::FNEG, MVT::v16f32, Custom); - setOperationAction(ISD::FABS, MVT::v16f32, Custom); - - setOperationAction(ISD::FADD, MVT::v8f64, Legal); - setOperationAction(ISD::FSUB, MVT::v8f64, Legal); - setOperationAction(ISD::FMUL, MVT::v8f64, Legal); - setOperationAction(ISD::FDIV, MVT::v8f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); - setOperationAction(ISD::FNEG, MVT::v8f64, Custom); - setOperationAction(ISD::FABS, MVT::v8f64, Custom); - setOperationAction(ISD::FMA, MVT::v8f64, Legal); - setOperationAction(ISD::FMA, MVT::v16f32, Legal); + + for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, + MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, + MVT::v8i64, MVT::v32i16, MVT::v64i8}) { + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom); + setTruncStoreAction(VT, MaskVT, Custom); + } + + for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FMA, VT, Legal); + } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); @@ -1389,7 +1195,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); - if (Subtarget->hasVLX()){ + if (Subtarget.hasVLX()){ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); @@ -1412,15 +1218,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); - if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); - + setOperationAction(ISD::VSELECT, MVT::v8i1, Expand); + setOperationAction(ISD::VSELECT, MVT::v16i1, Expand); + if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, 
MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); - if (Subtarget->hasVLX()) { + if (Subtarget.hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); @@ -1431,7 +1236,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); } } - if (Subtarget->hasVLX()) { + if (Subtarget.hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); @@ -1440,7 +1245,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + + // FIXME. This commands are available on SSE/AVX2, add relevant patterns. + setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal); } + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); @@ -1453,20 +1273,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - if (Subtarget->hasDQI()) { + if (Subtarget.hasDQI()) { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); } - setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); - setOperationAction(ISD::FRINT, MVT::v16f32, Legal); - setOperationAction(ISD::FRINT, MVT::v8f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); @@ -1501,139 +1318,115 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v16i32, Legal); 
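      // For reference (a sketch, not part of this change): the
      // setOperationPromotedToType() calls that replace the old
      // setOperationAction(..., Promote) + AddPromotedToType(...) pairs in the
      // hunks above are, assuming the TargetLoweringBase helper simply wraps
      // the two-call pattern, roughly:
      //
      //   void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
      //     setOperationAction(Opc, OrigVT, Promote);
      //     AddPromotedToType(Opc, OrigVT, DestVT);
      //   }
      //
      // so the rewrite should be purely mechanical with no change in
      // legalization behavior.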
setOperationAction(ISD::UMIN, MVT::v8i64, Legal); - setOperationAction(ISD::ADD, MVT::v8i64, Legal); - setOperationAction(ISD::ADD, MVT::v16i32, Legal); - - setOperationAction(ISD::SUB, MVT::v8i64, Legal); - setOperationAction(ISD::SUB, MVT::v16i32, Legal); + setOperationAction(ISD::ADD, MVT::v8i1, Expand); + setOperationAction(ISD::ADD, MVT::v16i1, Expand); + setOperationAction(ISD::SUB, MVT::v8i1, Expand); + setOperationAction(ISD::SUB, MVT::v16i1, Expand); + setOperationAction(ISD::MUL, MVT::v8i1, Expand); + setOperationAction(ISD::MUL, MVT::v16i1, Expand); setOperationAction(ISD::MUL, MVT::v16i32, Legal); - setOperationAction(ISD::SRL, MVT::v8i64, Custom); - setOperationAction(ISD::SRL, MVT::v16i32, Custom); - - setOperationAction(ISD::SHL, MVT::v8i64, Custom); - setOperationAction(ISD::SHL, MVT::v16i32, Custom); - - setOperationAction(ISD::SRA, MVT::v8i64, Custom); - setOperationAction(ISD::SRA, MVT::v16i32, Custom); - - setOperationAction(ISD::AND, MVT::v8i64, Legal); - setOperationAction(ISD::OR, MVT::v8i64, Legal); - setOperationAction(ISD::XOR, MVT::v8i64, Legal); - setOperationAction(ISD::AND, MVT::v16i32, Legal); - setOperationAction(ISD::OR, MVT::v16i32, Legal); - setOperationAction(ISD::XOR, MVT::v16i32, Legal); + for (auto VT : { MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + } - if (Subtarget->hasCDI()) { + if (Subtarget.hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); - if (Subtarget->hasVLX()) { + if (Subtarget.hasVLX()) { setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); - - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); } else { setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, 
MVT::v4i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); } - } // Subtarget->hasCDI() - if (Subtarget->hasDQI()) { - setOperationAction(ISD::MUL, MVT::v2i64, Legal); - setOperationAction(ISD::MUL, MVT::v4i64, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } // Subtarget.hasCDI() + + if (Subtarget.hasDQI()) { + if (Subtarget.hasVLX()) { + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); + } setOperationAction(ISD::MUL, MVT::v8i64, Legal); } // Custom lower several nodes. - for (MVT VT : MVT::vector_valuetypes()) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EltSize == 1) { - setOperationAction(ISD::AND, VT, Legal); - setOperationAction(ISD::OR, VT, Legal); - setOperationAction(ISD::XOR, VT, Legal); - } - if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - } - // Extract subvector is special because the value type - // (result) is 256/128-bit but the source is 512-bit wide. - if (VT.is128BitVector() || VT.is256BitVector()) { - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - } - if (VT.getVectorElementType() == MVT::i1) - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); - - // Do not attempt to custom lower other non-512-bit vectors - if (!VT.is512BitVector()) - continue; - - if (EltSize >= 32) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::MGATHER, VT, Legal); - setOperationAction(ISD::MSCATTER, VT, Custom); - } + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + // Extract subvector is special because the value type + // (result) is 256-bit but the source is 512-bit wide. + // 128-bit was made Custom under AVX1. 
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, + MVT::v8f32, MVT::v4f64 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, + MVT::v16i1, MVT::v32i1, MVT::v64i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { - setOperationAction(ISD::SELECT, VT, Promote); - AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 - if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::LOAD, MVT::v32i16, Legal); - setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::ADD, MVT::v32i1, Expand); + setOperationAction(ISD::ADD, MVT::v64i1, Expand); + setOperationAction(ISD::SUB, MVT::v32i1, Expand); + setOperationAction(ISD::SUB, MVT::v64i1, Expand); + setOperationAction(ISD::MUL, MVT::v32i1, Expand); + setOperationAction(ISD::MUL, MVT::v64i1, Expand); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); - setOperationAction(ISD::ADD, MVT::v32i16, Legal); - setOperationAction(ISD::ADD, MVT::v64i8, Legal); - setOperationAction(ISD::SUB, MVT::v32i16, Legal); - setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); @@ -1646,12 +1439,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); @@ -1667,6 +1463,11 @@ 
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i1, Expand); + setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); + setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1679,36 +1480,59 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); - if (Subtarget->hasVLX()) + if (Subtarget.hasVLX()) setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - if (Subtarget->hasCDI()) { + LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom; + for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { + setOperationAction(ISD::MLOAD, VT, Action); + setOperationAction(ISD::MSTORE, VT, Action); + } + + if (Subtarget.hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - - setOperationAction(ISD::AND, VT, Promote); - AddPromotedToType (ISD::AND, VT, MVT::v8i64); - setOperationAction(ISD::OR, VT, Promote); - AddPromotedToType (ISD::OR, VT, MVT::v8i64); - setOperationAction(ISD::XOR, VT, Promote); - AddPromotedToType (ISD::XOR, VT, MVT::v8i64); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + + setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); + setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); + setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); + } + + for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); + if (Subtarget.hasVLX()) { + // FIXME. This commands are available on SSE/AVX2, add relevant patterns. 
+ setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal); + } } } - if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + setOperationAction(ISD::ADD, MVT::v2i1, Expand); + setOperationAction(ISD::ADD, MVT::v4i1, Expand); + setOperationAction(ISD::SUB, MVT::v2i1, Expand); + setOperationAction(ISD::SUB, MVT::v4i1, Expand); + setOperationAction(ISD::MUL, MVT::v2i1, Expand); + setOperationAction(ISD::MUL, MVT::v4i1, Expand); + + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); @@ -1721,31 +1545,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i1, Expand); + setOperationAction(ISD::VSELECT, MVT::v4i1, Expand); + + for (auto VT : { MVT::v4i32, MVT::v8i32 }) { + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + } - setOperationAction(ISD::AND, MVT::v8i32, Legal); - setOperationAction(ISD::OR, MVT::v8i32, Legal); - setOperationAction(ISD::XOR, MVT::v8i32, Legal); - setOperationAction(ISD::AND, MVT::v4i32, Legal); - setOperationAction(ISD::OR, MVT::v4i32, Legal); - setOperationAction(ISD::XOR, MVT::v4i32, Legal); - setOperationAction(ISD::SRA, MVT::v2i64, Custom); - setOperationAction(ISD::SRA, MVT::v4i64, Custom); - - setOperationAction(ISD::SMAX, MVT::v2i64, Legal); - setOperationAction(ISD::SMAX, MVT::v4i64, Legal); - setOperationAction(ISD::UMAX, MVT::v2i64, Legal); - setOperationAction(ISD::UMAX, MVT::v4i64, Legal); - setOperationAction(ISD::SMIN, MVT::v2i64, Legal); - setOperationAction(ISD::SMIN, MVT::v4i64, Legal); - setOperationAction(ISD::UMIN, MVT::v2i64, Legal); - setOperationAction(ISD::UMIN, MVT::v4i64, Legal); + for (auto VT : { MVT::v2i64, MVT::v4i64 }) { + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + } } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) { + if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } @@ -1757,7 +1578,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { - if (VT == MVT::i64 && !Subtarget->is64Bit()) + if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. 
setOperationAction(ISD::SADDO, VT, Custom); @@ -1768,7 +1589,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, VT, Custom); } - if (!Subtarget->is64Bit()) { + if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); @@ -1776,10 +1597,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into one node or libcall if possible. - if (Subtarget->hasSinCos()) { + if (Subtarget.hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->isTargetDarwin()) { + if (Subtarget.isTargetDarwin()) { // For MacOSX, we don't want the normal expansion of a libcall to sincos. // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); @@ -1787,7 +1608,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - if (Subtarget->isTargetWin64()) { + if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); @@ -1796,6 +1617,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. + if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC()) + for (ISD::NodeType Op : + {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, + ISD::FLOG10, ISD::FPOW, ISD::FSIN}) + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1827,13 +1659,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); - computeRegisterProperties(Subtarget->getRegisterInfo()); + computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; @@ -1843,9 +1674,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. - // A predictable cmov does not hurt on an in-order CPU. - // FIXME: Use a CPU attribute to trigger this, not a CPU model. - PredictableSelectIsExpensive = !Subtarget->isAtom(); + // An out-of-order CPU can speculatively execute past a predictable branch, + // but a conditional move could be stalled by an expensive earlier operation. + PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. @@ -1854,7 +1685,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // This has so far only been implemented for 64-bit MachO. 
bool X86TargetLowering::useLoadStackGuardNode() const { - return Subtarget->isTargetMachO() && Subtarget->is64Bit(); + return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } TargetLoweringBase::LegalizeTypeAction @@ -1867,24 +1698,25 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } -EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, +EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, + LLVMContext& Context, EVT VT) const { if (!VT.isVector()) - return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; + return Subtarget.hasAVX512() ? MVT::i1: MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); const unsigned NumElts = VVT.getVectorNumElements(); - const MVT EltVT = VVT.getVectorElementType(); + MVT EltVT = VVT.getVectorElementType(); if (VVT.is512BitVector()) { - if (Subtarget->hasAVX512()) + if (Subtarget.hasAVX512()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; } - if (Subtarget->hasBWI()) + if (Subtarget.hasBWI()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 32: return MVT::v32i1; @@ -1892,23 +1724,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, } } - if (VVT.is256BitVector() || VVT.is128BitVector()) { - if (Subtarget->hasVLX()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } - if (Subtarget->hasBWI() && Subtarget->hasVLX()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - case 32: return MVT::v32i1; - } + if (Subtarget.hasBWI() && Subtarget.hasVLX()) + return MVT::getVectorVT(MVT::i1, NumElts); + + if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) { + EVT LegalVT = getTypeToTransformTo(Context, VT); + EltVT = LegalVT.getVectorElementType().getSimpleVT(); } + + if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } } return VT.changeVectorElementTypeToInteger(); @@ -1945,7 +1774,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) @@ -1954,7 +1783,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, } unsigned Align = 4; - if (Subtarget->hasSSE1()) + if (Subtarget.hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } @@ -1977,35 +1806,40 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); - if ((!IsMemset || ZeroMemset) && - !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (!Subtarget->isUnalignedMem16Slow() || + (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { - if (Size >= 32) { - // FIXME: Check if unaligned 32-byte accesses are slow. 
- if (Subtarget->hasInt256()) - return MVT::v8i32; - if (Subtarget->hasFp256()) - return MVT::v8f32; + // FIXME: Check if unaligned 32-byte accesses are slow. + if (Size >= 32 && Subtarget.hasAVX()) { + // Although this isn't a well-supported type for AVX1, we'll let + // legalization and shuffle lowering produce the optimal codegen. If we + // choose an optimal type with a vector element larger than a byte, + // getMemsetStores() may create an intermediate splat (using an integer + // multiply) before we splat as a vector. + return MVT::v32i8; } - if (Subtarget->hasSSE2()) - return MVT::v4i32; - if (Subtarget->hasSSE1()) + if (Subtarget.hasSSE2()) + return MVT::v16i8; + // TODO: Can SSE1 handle a byte vector? + if (Subtarget.hasSSE1()) return MVT::v4f32; - } else if (!MemcpyStrSrc && Size >= 8 && - !Subtarget->is64Bit() && - Subtarget->hasSSE2()) { + } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && + !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. + // Also, do not use f64 to lower memset unless this is a memset of zeros. + // The gymnastics of splatting a byte value into an XMM register and then + // only using 8-byte stores (because this is a CPU with slow unaligned + // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. - if (Subtarget->is64Bit() && Size >= 8) + if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } @@ -2030,10 +1864,10 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, *Fast = true; break; case 128: - *Fast = !Subtarget->isUnalignedMem16Slow(); + *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: - *Fast = !Subtarget->isUnalignedMem32Slow(); + *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } @@ -2048,8 +1882,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. - if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && - Subtarget->isPICStyleGOT()) + if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. @@ -2057,15 +1890,14 @@ unsigned X86TargetLowering::getJumpTableEncoding() const { } bool X86TargetLowering::useSoftFloat() const { - return Subtarget->useSoftFloat(); + return Subtarget.useSoftFloat(); } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ - assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && - Subtarget->isPICStyleGOT()); + assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), @@ -2075,7 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, /// Returns relocation base for the given PIC jumptable. 
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { - if (!Subtarget->is64Bit()) + if (!Subtarget.is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), @@ -2089,7 +1921,7 @@ const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. - if (Subtarget->isPICStyleRIPRel()) + if (Subtarget.isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. @@ -2105,7 +1937,7 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; + RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; @@ -2121,47 +1953,76 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, return std::make_pair(RRC, Cost); } -bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, - unsigned &Offset) const { - if (!Subtarget->isTargetLinux()) - return false; +unsigned X86TargetLowering::getAddressSpace() const { + if (Subtarget.is64Bit()) + return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; + return 256; +} - if (Subtarget->is64Bit()) { - // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: - Offset = 0x28; - if (getTargetMachine().getCodeModel() == CodeModel::Kernel) - AddressSpace = 256; - else - AddressSpace = 257; - } else { - // %gs:0x14 on i386 - Offset = 0x14; - AddressSpace = 256; +Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // glibc has a special slot for the stack guard in tcbhead_t, use it instead + // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h) + if (!Subtarget.isTargetGlibc()) + return TargetLowering::getIRStackGuard(IRB); + + // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x14 on i386 + unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + unsigned AddressSpace = getAddressSpace(); + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + +void X86TargetLowering::insertSSPDeclarations(Module &M) const { + // MSVC CRT provides functionalities for stack protection. + if (Subtarget.getTargetTriple().isOSMSVCRT()) { + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); + + // MSVC CRT has a function to validate security cookie. + auto *SecurityCheckCookie = cast<Function>( + M.getOrInsertFunction("__security_check_cookie", + Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext()), nullptr)); + SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); + SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); + return; } - return true; + // glibc has a special slot for the stack guard. 
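  // (Sketch-level note, not from the commit: on X86 the values returned by
  // getAddressSpace() above are the backend's encoding of segment registers -
  // 256 selects %gs and 257 selects %fs - so the pointer built in
  // getIRStackGuard(), e.g. an inttoptr of offset 0x28 in address space 257 on
  // 64-bit non-kernel targets, is expected to lower to a segment-relative load
  // such as "movq %fs:0x28, %reg". The 0x28/0x14 offsets are the glibc
  // tcbhead_t stack-guard slots referenced above.)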
+ if (Subtarget.isTargetGlibc()) + return; + TargetLowering::insertSSPDeclarations(M); +} + +Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. + if (Subtarget.getTargetTriple().isOSMSVCRT()) + return M.getGlobalVariable("__security_cookie"); + return TargetLowering::getSDagStackGuard(M); +} + +Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + if (Subtarget.getTargetTriple().isOSMSVCRT()) + return M.getFunction("__security_check_cookie"); + return TargetLowering::getSSPStackGuardCheck(M); } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) + if (!Subtarget.isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h unsigned AddressSpace, Offset; - if (Subtarget->is64Bit()) { - // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: - Offset = 0x48; - if (getTargetMachine().getCodeModel() == CodeModel::Kernel) - AddressSpace = 256; - else - AddressSpace = 257; - } else { - // %gs:0x24 on i386 - Offset = 0x24; - AddressSpace = 256; - } + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x24 on i386 + Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; + AddressSpace = getAddressSpace(); return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); @@ -2194,11 +2055,11 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { } SDValue -X86TargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, +X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc dl, SelectionDAG &DAG) const { + const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); @@ -2214,10 +2075,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, - MVT::i16)); + MVT::i32)); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; @@ -2244,14 +2105,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. 
if (ValVT == MVT::f64 && - (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to @@ -2269,7 +2130,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); @@ -2277,7 +2138,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. - if (!Subtarget->hasSSE2()) + if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } @@ -2288,6 +2149,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + // Swift calling convention does not require we copy the sret argument + // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. + // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, @@ -2298,11 +2162,30 @@ X86TargetLowering::LowerReturn(SDValue Chain, // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, + // When we have both sret and another return value, we should use the + // original Chain stored in RetOps[0], instead of the current Chain updated + // in the above loop. If we only have sret, RetOps[0] equals to Chain. + + // For the case of sret and another return value, we have + // Chain_0 at the function entry + // Chain_1 = getCopyToReg(Chain_0) in the above loop + // If we use Chain_1 in getCopyFromReg, we will have + // Val = getCopyFromReg(Chain_1) + // Chain_2 = getCopyToReg(Chain_1, Val) from below + + // getCopyToReg(Chain_0) will be glued together with + // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be + // in Unit B, and we will have cyclic dependency between Unit A and Unit B: + // Data dependency from Unit B to Unit A due to usage of Val in + // getCopyToReg(Chain_1, Val) + // Chain dependency from Unit A to Unit B + + // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. + SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg - = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? + = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? 
X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); @@ -2312,7 +2195,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } - const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { @@ -2337,9 +2220,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { - if (N->getNumValues() != 1) - return false; - if (!N->hasNUsesOfValue(1, 0)) + if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; @@ -2375,15 +2256,19 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -EVT -X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, - ISD::NodeType ExtendKind) const { - MVT ReturnMVT; - // TODO: Is this also valid on 32-bit? - if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) +EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT = MVT::i32; + + bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); + if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { + // The ABI does not require i1, i8 or i16 to be extended. + // + // On Darwin, there is code in the wild relying on Clang's old behaviour of + // always extending i8/i16 return values, so keep doing that for now. + // (PR26665). ReturnMVT = MVT::i8; - else - ReturnMVT = MVT::i32; + } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; @@ -2392,16 +2277,14 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// -SDValue -X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { +SDValue X86TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; - bool Is64Bit = Subtarget->is64Bit(); + bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); @@ -2413,7 +2296,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2422,6 +2305,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { + if (!Subtarget.hasX87()) + report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } @@ -2492,10 +2377,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. -static SDValue -CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, - ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - SDLoc dl) { +static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, + SDValue Chain, ISD::ArgFlagsTy Flags, + SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), @@ -2549,13 +2433,11 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { } SDValue -X86TargetLowering::LowerMemArgument(SDValue Chain, - CallingConv::ID CallConv, +X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, SelectionDAG &DAG, + const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, - MachineFrameInfo *MFI, - unsigned i) const { + MachineFrameInfo *MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( @@ -2602,6 +2484,14 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + + // Set SExt or ZExt flag. + if (VA.getLocInfo() == CCValAssign::ZExt) { + MFI->setObjectZExt(FI, true); + } else if (VA.getLocInfo() == CCValAssign::SExt) { + MFI->setObjectSExt(FI, true); + } + // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI->setObjectOffset(FI, Offset); @@ -2610,8 +2500,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, - false, false, 0); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2619,10 +2508,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // FIXME: Get this from tablegen. 
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, - const X86Subtarget *Subtarget) { - assert(Subtarget->is64Bit()); + const X86Subtarget &Subtarget) { + assert(Subtarget.is64Bit()); - if (Subtarget->isCallingConvWin64(CallConv)) { + if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; @@ -2638,9 +2527,9 @@ static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, // FIXME: Get this from tablegen. static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, - const X86Subtarget *Subtarget) { - assert(Subtarget->is64Bit()); - if (Subtarget->isCallingConvWin64(CallConv)) { + const X86Subtarget &Subtarget) { + assert(Subtarget.is64Bit()); + if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. @@ -2650,10 +2539,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, const Function *Fn = MF.getFunction(); bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); - bool isSoftFloat = Subtarget->useSoftFloat(); + bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) + if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; @@ -2667,21 +2556,21 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Function* Fn = MF.getFunction(); + const Function *Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && - Subtarget->isTargetCygMing() && + Subtarget.isTargetCygMing() && Fn->getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool Is64Bit = Subtarget->is64Bit(); - bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); + bool Is64Bit = Subtarget.is64Bit(); + bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); @@ -2778,13 +2667,18 @@ SDValue X86TargetLowering::LowerFormalArguments( // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) - ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, - MachinePointerInfo(), false, false, false, 0); + ArgValue = + DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // Swift calling convention does not require we copy the sret argument + // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 
+ if (CallConv == CallingConv::Swift) + continue; + // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the @@ -2819,7 +2713,7 @@ SDValue X86TargetLowering::LowerFormalArguments( } // Figure out if XMM registers are in use. - assert(!(Subtarget->useSoftFloat() && + assert(!(Subtarget.useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); @@ -2831,7 +2725,7 @@ SDValue X86TargetLowering::LowerFormalArguments( ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); - assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. @@ -2865,7 +2759,7 @@ SDValue X86TargetLowering::LowerFormalArguments( } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so - // they may be loaded by deferencing the result of va_next. + // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( @@ -2884,8 +2778,7 @@ SDValue X86TargetLowering::LowerFormalArguments( DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } @@ -2913,13 +2806,13 @@ SDValue X86TargetLowering::LowerFormalArguments( // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. - if (Subtarget->hasAVX512() && + if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; - else if (Subtarget->hasAVX()) + else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; - else if (Subtarget->hasSSE2()) + else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. @@ -2960,8 +2853,8 @@ SDValue X86TargetLowering::LowerFormalArguments( FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && - !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) + !Subtarget.getTargetTriple().isOSMSVCRT() && + argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2987,7 +2880,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough - // space to accomodate holding this slot at the correct offset). + // space to accommodate holding this slot at the correct offset). 
int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } @@ -2996,12 +2889,11 @@ SDValue X86TargetLowering::LowerFormalArguments( return Chain; } -SDValue -X86TargetLowering::LowerMemOpCallTo(SDValue Chain, - SDValue StackPtr, SDValue Arg, - SDLoc dl, SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const { +SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, const SDLoc &dl, + SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), @@ -3011,24 +2903,20 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, return DAG.getStore( Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. -SDValue -X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, - SDValue &OutRetAddr, SDValue Chain, - bool IsTailCall, bool Is64Bit, - int FPDiff, SDLoc dl) const { +SDValue X86TargetLowering::EmitTailCallLoadRetAddr( + SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, + bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. - OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), - false, false, false, 0); + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } @@ -3037,7 +2925,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, - int FPDiff, SDLoc dl) { + int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. @@ -3047,21 +2935,20 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), NewReturnAddrFI), - false, false, 0); + DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. 
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, +static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); + return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue @@ -3079,9 +2966,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool Is64Bit = Subtarget->is64Bit(); - bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); + bool Is64Bit = Subtarget.is64Bit(); + bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); + StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); @@ -3092,7 +2979,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Attr.getValueAsString() == "true") isTailCall = false; - if (Subtarget->isPICStyleGOT() && + if (Subtarget.isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT @@ -3195,7 +3082,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -3238,8 +3125,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - false, false, 0); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; break; } @@ -3273,7 +3159,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); - if (Subtarget->isPICStyleGOT()) { + if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { @@ -3314,7 +3200,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); - assert((Subtarget->hasSSE1() || !NumXMMRegs) + assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), @@ -3377,8 +3263,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store relative to framepointer. 
MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - false, false, 0)); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } @@ -3416,70 +3301,29 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { - unsigned char OpFlags = 0; - bool ExtraLoad = false; - unsigned WrapperKind = ISD::DELETED_NODE; - - // On ELF targets, in both X86-64 and X86-32 mode, direct calls to - // external symbols most go through the PLT in PIC mode. If the symbol - // has hidden or protected visibility, or if it is static or local, then - // we don't need to use the PLT - we can directly call it. - if (Subtarget->isTargetELF() && - DAG.getTarget().getRelocationModel() == Reloc::PIC_ && - GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { - OpFlags = X86II::MO_PLT; - } else if (Subtarget->isPICStyleStubAny() && - !GV->isStrongDefinitionForLinker() && - (!Subtarget->getTargetTriple().isMacOSX() || - Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { - // PC-relative references to external symbols should go through $stub, - // unless we're building with the leopard linker or later, which - // automatically synthesizes these stubs. - OpFlags = X86II::MO_DARWIN_STUB; - } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && - cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { - // If the function is marked as non-lazy, generate an indirect call - // which loads from the GOT directly. This avoids runtime overhead - // at the cost of eager binding (and one extra byte of encoding). - OpFlags = X86II::MO_GOTPCREL; - WrapperKind = X86ISD::WrapperRIP; - ExtraLoad = true; - } + unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); - // Add a wrapper if needed. - if (WrapperKind != ISD::DELETED_NODE) + if (OpFlags == X86II::MO_GOTPCREL) { + // Add a wrapper. Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - // Add extra indirection if needed. - if (ExtraLoad) + getPointerTy(DAG.getDataLayout()), Callee); + // Add extra indirection Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, - false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); + } } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - unsigned char OpFlags = 0; - - // On ELF targets, in either X86-64 or X86-32 mode, direct calls to - // external symbols should go through the PLT. - if (Subtarget->isTargetELF() && - DAG.getTarget().getRelocationModel() == Reloc::PIC_) { - OpFlags = X86II::MO_PLT; - } else if (Subtarget->isPICStyleStubAny() && - (!Subtarget->getTargetTriple().isMacOSX() || - Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { - // PC-relative references to external symbols should go through $stub, - // unless we're building with the leopard linker or later, which - // automatically synthesizes these stubs. 
- OpFlags = X86II::MO_DARWIN_STUB; - } + const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + unsigned char OpFlags = + Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); - } else if (Subtarget->isTarget64BitILP32() && + } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); @@ -3552,7 +3396,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && - !Subtarget->getTargetTriple().isOSMSVCRT() && + !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. @@ -3562,6 +3406,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else NumBytesForCalleeToPop = 0; // Callee pops nothing. + if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { + // No need to reset the stack after the call if the call doesn't return. To + // make the MI verify, we'll pretend the callee does it for us. + NumBytesForCalleeToPop = NumBytes; + } + // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, @@ -3614,8 +3464,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3636,8 +3486,28 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, - const X86InstrInfo *TII) { + const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + + for (;;) { + // Look through nodes that don't alter the bits of the incoming value. 
+ unsigned Op = Arg.getOpcode(); + if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { + Arg = Arg.getOperand(0); + continue; + } + if (Op == ISD::TRUNCATE) { + const SDValue &TruncInput = Arg.getOperand(0); + if (TruncInput.getOpcode() == ISD::AssertZext && + cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == + Arg.getValueType()) { + Arg = TruncInput.getOperand(0); + continue; + } + } + break; + } + int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); @@ -3647,7 +3517,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, if (!Def) return false; if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(Def, FI)) + if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); @@ -3682,7 +3552,20 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; - return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); + + if (Offset != MFI->getObjectOffset(FI)) + return false; + + if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) { + // If the argument location is wider than the argument type, check that any + // extension flags match. + if (Flags.isZExt() != MFI->isObjectZExt(FI) || + Flags.isSExt() != MFI->isObjectSExt(FI)) { + return false; + } + } + + return Bytes == MFI->getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets @@ -3708,8 +3591,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; - bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); - bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); + bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -3728,7 +3611,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3739,6 +3622,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. + LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. 
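[Editor's note] The MatchingStackOffset hunk above now peels value-preserving wrappers off an outgoing argument — ZERO_EXTEND, ANY_EXTEND, BITCAST, and a TRUNCATE that merely undoes a matching AssertZext — before comparing it against the caller's fixed stack object. The sketch below is a minimal stand-alone model of that peeling step, not LLVM's SelectionDAG API: the Node struct, the Opc enum and lookThroughValuePreservingNodes are illustrative stand-ins, and the AssertZext width check is modeled with a plain integer field.

    #include <cassert>
    #include <vector>

    // Toy stand-ins for the SelectionDAG node kinds the loop looks through.
    enum class Opc { ZeroExtend, AnyExtend, Bitcast, Truncate, AssertZext, CopyFromReg };

    struct Node {
      Opc opcode;
      std::vector<Node *> operands;
      int bits;         // width of the value this node produces
      int assertedBits; // AssertZext only: width the value is known to fit in
    };

    // Keep stripping nodes that do not alter the bits of the incoming value,
    // so the underlying argument (e.g. a copy from a register or a load from
    // a fixed stack slot) becomes visible for the stack-offset match.
    Node *lookThroughValuePreservingNodes(Node *arg) {
      for (;;) {
        if (arg->opcode == Opc::ZeroExtend || arg->opcode == Opc::AnyExtend ||
            arg->opcode == Opc::Bitcast) {
          arg = arg->operands[0];
          continue;
        }
        // A truncate is transparent only when it undoes an AssertZext of the
        // same narrow width, i.e. the bits being dropped are known zero.
        if (arg->opcode == Opc::Truncate) {
          Node *truncInput = arg->operands[0];
          if (truncInput->opcode == Opc::AssertZext &&
              truncInput->assertedBits == arg->bits) {
            arg = truncInput->operands[0];
            continue;
          }
        }
        return arg; // nothing left to peel
      }
    }

    int main() {
      // An i32 argument that was passed zero-extended from i8:
      // CopyFromReg -> AssertZext(i8) -> Truncate(i8) -> ZeroExtend(i32)
      Node reg{Opc::CopyFromReg, {}, 32, 0};
      Node az{Opc::AssertZext, {&reg}, 32, 8};
      Node tr{Opc::Truncate, {&az}, 8, 0};
      Node zx{Opc::ZeroExtend, {&tr}, 32, 0};
      // Peeling the wrappers exposes the original incoming value.
      assert(lookThroughValuePreservingNodes(&zx) == &reg);
      return 0;
    }

This mirrors why the hunk can then also compare the argument's extension flags against the stack object's recorded zext/sext state: the wrappers it discards are exactly the ones that cannot change those bits.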
@@ -3746,8 +3630,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( return false; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3767,8 +3650,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -3777,34 +3659,17 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( } } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + RetCC_X86, RetCC_X86)) + return false; + // The callee has to preserve all registers the caller needs to preserve. + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { - SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); - - SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); - - if (RVLocs1.size() != RVLocs2.size()) + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } unsigned StackArgsSize = 0; @@ -3815,8 +3680,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3830,7 +3694,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // the caller's fixed stack objects. 
MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = Subtarget->getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3839,26 +3703,25 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, - MFI, MRI, TII)) + MFI, MRI, TII, VA)) return false; } } } + bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. - if (!Subtarget->is64Bit() && - ((!isa<GlobalAddressSDNode>(Callee) && - !isa<ExternalSymbolSDNode>(Callee)) || - DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. - unsigned MaxInRegs = - (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3874,10 +3737,14 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( } } } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; } bool CalleeWillPop = - X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = @@ -3923,6 +3790,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::PALIGNR: + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -3935,16 +3804,30 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: + case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: + case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: + case X86ISD::VPERMIL2: case X86ISD::VPERMI: + case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VZEXT_MOVL: return true; } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, +static bool isTargetShuffleVariableMask(unsigned Opcode) { + switch (Opcode) { + default: return false; + case X86ISD::PSHUFB: + case X86ISD::VPERMILPV: + return true; + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3959,7 +3842,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3978,7 +3861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue 
X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -4047,17 +3930,20 @@ bool X86::isCalleePop(CallingConv::ID CallingConv, /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { - default: llvm_unreachable("Invalid integer condition!"); - case X86::COND_E: return true; - case X86::COND_G: return false; - case X86::COND_GE: return false; - case X86::COND_L: return false; - case X86::COND_LE: return false; - case X86::COND_NE: return true; - case X86::COND_B: return true; - case X86::COND_A: return true; - case X86::COND_BE: return true; - case X86::COND_AE: return true; + default: + llvm_unreachable("Invalid integer condition!"); + case X86::COND_E: + case X86::COND_NE: + case X86::COND_B: + case X86::COND_A: + case X86::COND_BE: + case X86::COND_AE: + return true; + case X86::COND_G: + case X86::COND_GE: + case X86::COND_L: + case X86::COND_LE: + return false; } } @@ -4080,8 +3966,9 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. -static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, - SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, + bool isFP, SDValue &LHS, SDValue &RHS, + SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { @@ -4181,24 +4068,50 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!IntrData) return false; + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.readMem = false; + Info.writeMem = false; + Info.vol = false; + Info.offset = 0; + switch (IntrData->Type) { - case LOADA: - case LOADU: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(I.getType()); + case EXPAND_FROM_MEM: { Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = (IntrData->Type == LOADA ? 
Info.memVT.getSizeInBits()/8 : 1); - Info.vol = false; + Info.memVT = MVT::getVT(I.getType()); + Info.align = 1; Info.readMem = true; - Info.writeMem = false; - return true; + break; } - default: + case COMPRESS_TO_MEM: { + Info.ptrVal = I.getArgOperand(0); + Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); + Info.align = 1; + Info.writeMem = true; break; } + case TRUNCATE_TO_MEM_VI8: + case TRUNCATE_TO_MEM_VI16: + case TRUNCATE_TO_MEM_VI32: { + Info.ptrVal = I.getArgOperand(0); + MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); + MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; + if (IntrData->Type == TRUNCATE_TO_MEM_VI8) + ScalarVT = MVT::i8; + else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) + ScalarVT = MVT::i16; + else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) + ScalarVT = MVT::i32; + + Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); + Info.align = 1; + Info.writeMem = true; + break; + } + default: + return false; + } - return false; + return true; } /// Returns true if the target can instruction select the @@ -4246,12 +4159,24 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. - return Subtarget->hasBMI(); + return Subtarget.hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. - return Subtarget->hasLZCNT(); + return Subtarget.hasLZCNT(); +} + +bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { + if (!Subtarget.hasBMI()) + return false; + + // There are only 32-bit and 64-bit forms for 'andn'. + EVT VT = Y.getValueType(); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + return true; } /// Return true if every element in Mask, beginning @@ -4269,11 +4194,26 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } +/// Return true if every element in Mask is undef or if its value +/// falls within the specified range (L, H]. +static bool isUndefOrInRange(ArrayRef<int> Mask, + int Low, int Hi) { + for (int M : Mask) + if (!isUndefOrInRange(M, Low, Hi)) + return false; + return true; +} + /// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } +/// Val is either the undef or zero sentinel value. +static bool isUndefOrZero(int Val) { + return (Val == SM_SentinelUndef || Val == SM_SentinelZero); +} + /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. @@ -4285,6 +4225,17 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } +/// Return true if every element in Mask, beginning +/// from position Pos and ending in Pos+Size, falls within the specified +/// sequential range (Low, Low+Size], or is undef or is zero. 
+static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, + unsigned Size, int Low) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) + if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) + return false; + return true; +} + /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector /// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { @@ -4399,9 +4350,8 @@ bool X86::isZeroNode(SDValue Elt) { // Build a vector of constants // Use an UNDEF node if MaskElt == -1. // Spilt 64-bit constants in the 32-bit mode. -static SDValue getConstVector(ArrayRef<int> Values, MVT VT, - SelectionDAG &DAG, - SDLoc dl, bool IsMask = false) { +static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, + const SDLoc &dl, bool IsMask = false) { SmallVector<SDValue, 32> Ops; bool Split = false; @@ -4424,63 +4374,40 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } - SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } /// Returns a vector of specified type with all zero elements. -static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, - SelectionDAG &DAG, SDLoc dl) { - assert(VT.isVector() && "Expected a vector type"); - - // Always build SSE zero vectors as <4 x i32> bitcasted - // to their dest type. This ensures they get CSE'd. +static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, + SelectionDAG &DAG, const SDLoc &dl) { + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || + VT.getVectorElementType() == MVT::i1) && + "Unexpected vector type"); + + // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest + // type. This ensures they get CSE'd. But if the integer type is not + // available, use a floating-point +0.0 instead. SDValue Vec; - if (VT.is128BitVector()) { // SSE - if (Subtarget->hasSSE2()) { // SSE2 - SDValue Cst = DAG.getConstant(0, dl, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } else { // SSE1 - SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); - } - } else if (VT.is256BitVector()) { // AVX - if (Subtarget->hasInt256()) { // AVX2 - SDValue Cst = DAG.getConstant(0, dl, MVT::i32); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); - } else { - // 256-bit logic and arithmetic instructions in AVX are all - // floating-point, no support for integer ops. Emit fp zeroed vectors. 
- SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); - } - } else if (VT.is512BitVector()) { // AVX-512 - SDValue Cst = DAG.getConstant(0, dl, MVT::i32); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + if (!Subtarget.hasSSE2() && VT.is128BitVector()) { + Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); } else if (VT.getVectorElementType() == MVT::i1) { - - assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) - && "Unexpected vector type"); - assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) - && "Unexpected vector type"); - SDValue Cst = DAG.getConstant(0, dl, MVT::i1); - SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } else - llvm_unreachable("Unexpected vector type"); - + assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && + "Unexpected vector type"); + assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) && + "Unexpected vector type"); + Vec = DAG.getConstant(0, dl, VT); + } else { + unsigned Num32BitElts = VT.getSizeInBits() / 32; + Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); + } return DAG.getBitcast(VT, Vec); } -static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl, - unsigned vectorWidth) { +static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, + const SDLoc &dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); EVT VT = Vec.getValueType(); @@ -4490,7 +4417,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. - if (Vec.getOpcode() == ISD::UNDEF) + if (Vec.isUndef()) return DAG.getUNDEF(ResultVT); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR @@ -4503,8 +4430,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getNode(ISD::BUILD_VECTOR, + dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -4516,27 +4443,27 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { +static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, const SDLoc &dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); + return extractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. 
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { +static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); + return extractSubVector(Vec, IdxVal, DAG, dl, 256); } -static SDValue InsertSubVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl, unsigned vectorWidth) { +static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, const SDLoc &dl, + unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result - if (Vec.getOpcode() == ISD::UNDEF) + if (Vec.isUndef()) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); @@ -4560,8 +4487,8 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { +static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -4570,7 +4497,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, // extend the subvector to the size of the result vector. Make sure that // we are not recursing on that node by checking for undef here. if (IdxVal == 0 && Result.getValueType().is256BitVector() && - Result.getOpcode() != ISD::UNDEF) { + !Result.isUndef()) { EVT ResultVT = Result.getValueType(); SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(ResultVT); @@ -4607,17 +4534,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return DAG.getBitcast(ResultVT, Vec256); } - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); + return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { +static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); + return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } /// Insert i1-subvector to i1-vector. -static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { +static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); @@ -4647,43 +4575,71 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { // 3. Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) + // extend to natively supported kshift + MVT MinVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + MVT WideOpVT = OpVT; + if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits()) + WideOpVT = MinVT; + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(OpVT); - SDValue WideSubVec = - DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); - if (Vec.isUndef()) - return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue Undef = DAG.getUNDEF(WideOpVT); + SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + Undef, SubVec, ZeroIdx); + + // Extract sub-vector if require. + auto ExtractSubVec = [&](SDValue V) { + return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, + OpVT, V, ZeroIdx); + }; + + if (Vec.isUndef()) { + if (IdxVal != 0) { + SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); + } + return ExtractSubVec(WideSubVec); + } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, - DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; + return ExtractSubVec(Vec); } if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - // Merge them together - return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); + Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + // Merge them together, SubVec should be zero extended. 
+ WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + SubVec, ZeroIdx); + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); + return ExtractSubVec(Vec); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); + Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); + return ExtractSubVec(Vec); } // Subvector should be inserted in the middle - use shuffle + WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); SmallVector<int, 64> Mask; for (unsigned i = 0; i < NumElems; ++i) Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? @@ -4695,103 +4651,206 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. -static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, +static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert128BitVector(V, V2, NumElems/2, DAG, dl); + const SDLoc &dl) { + SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return insert128BitVector(V, V2, NumElems / 2, DAG, dl); } -static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, +static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert256BitVector(V, V2, NumElems/2, DAG, dl); + const SDLoc &dl) { + SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return insert256BitVector(V, V2, NumElems / 2, DAG, dl); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with -/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. 
-static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, - SelectionDAG &DAG, SDLoc dl) { - assert(VT.isVector() && "Expected a vector type"); +static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, + SelectionDAG &DAG, const SDLoc &dl) { + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && + "Expected a 128/256/512-bit vector type"); - SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); + APInt Ones = APInt::getAllOnesValue(32); + unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec; - if (VT.is512BitVector()) { - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); - } else if (VT.is256BitVector()) { - if (Subtarget->hasInt256()) { // AVX2 - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); - } else { // AVX - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); - } - } else if (VT.is128BitVector()) { - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } else - llvm_unreachable("Unexpected vector type"); - + if (!Subtarget.hasInt256() && NumElts == 8) { + Vec = DAG.getConstant(Ones, dl, MVT::v4i32); + Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); + } else { + Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); + } return DAG.getBitcast(VT, Vec); } /// Returns a vector_shuffle node for an unpackl operation. -static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, - SDValue V2) { +static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, + SDValue V1, SDValue V2) { + assert(VT.is128BitVector() && "Expected a 128-bit vector type"); unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; + SmallVector<int, 8> Mask(NumElems); for (unsigned i = 0, e = NumElems/2; i != e; ++i) { - Mask.push_back(i); - Mask.push_back(i + NumElems); + Mask[i * 2] = i; + Mask[i * 2 + 1] = i + NumElems; } - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); + return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. -static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, - SDValue V2) { +static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, + SDValue V1, SDValue V2) { + assert(VT.is128BitVector() && "Expected a 128-bit vector type"); unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; + SmallVector<int, 8> Mask(NumElems); for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { - Mask.push_back(i + Half); - Mask.push_back(i + NumElems + Half); + Mask[i * 2] = i + Half; + Mask[i * 2 + 1] = i + NumElems + Half; } - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); + return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). -static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? 
getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 16> MaskVec; - for (unsigned i = 0; i != NumElems; ++i) + int NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> MaskVec(NumElems); + for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. - MaskVec.push_back(i == Idx ? NumElems : i); - return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); + MaskVec[i] = (i == Idx) ? NumElems : i; + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); +} + +static SDValue peekThroughBitcasts(SDValue V) { + while (V.getNode() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + return V; +} + +static bool getTargetShuffleMaskIndices(SDValue MaskNode, + unsigned MaskEltSizeInBits, + SmallVectorImpl<uint64_t> &RawMask) { + MaskNode = peekThroughBitcasts(MaskNode); + + MVT VT = MaskNode.getSimpleValueType(); + assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); + + // Split an APInt element into MaskEltSizeInBits sized pieces and + // insert into the shuffle mask. + auto SplitElementToMask = [&](APInt Element) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. + int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits; + for (int i = 0; i < Split; ++i) { + APInt RawElt = Element.getLoBits(MaskEltSizeInBits); + Element = Element.lshr(MaskEltSizeInBits); + RawMask.push_back(RawElt.getZExtValue()); + } + }; + + if (MaskNode.getOpcode() == X86ISD::VBROADCAST) { + // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 + // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0 + if (VT.getScalarSizeInBits() != MaskEltSizeInBits) + return false; + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) { + const APInt &MaskElement = CN->getAPIntValue(); + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits); + RawMask.push_back(RawElt.getZExtValue()); + } + } + return false; + } + + if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && + MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { + + // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 + if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) + return false; + unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; + + SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { + SplitElementToMask(CN->getAPIntValue()); + RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); + return true; + } + return false; + } + + if (MaskNode.getOpcode() != ISD::BUILD_VECTOR) + return false; + + // We can always decode if the buildvector is all zero constants, + // but can't use isBuildVectorAllZeros as it might contain UNDEFs. 
+ if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) { + RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0); + return true; + } + + // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 + if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) + return false; + + for (SDValue Op : MaskNode->ops()) { + if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode())) + SplitElementToMask(CN->getAPIntValue()); + else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode())) + SplitElementToMask(CFN->getValueAPF().bitcastToAPInt()); + else + return false; + } + + return true; +} + +static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) { + MaskNode = peekThroughBitcasts(MaskNode); + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return nullptr; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return nullptr; + + return dyn_cast<Constant>(MaskCP->getConstVal()); } /// Calculates the shuffle mask corresponding to the target-specific opcode. -/// Returns true if the Mask could be calculated. Sets IsUnary to true if only -/// uses one source. Note that this will set IsUnary for shuffles which use a -/// single input multiple times, and in those cases it will -/// adjust the mask to only have indices within that single input. +/// If the mask could be calculated, returns it in \p Mask, returns the shuffle +/// operands in \p Ops, and returns true. +/// Sets \p IsUnary to true if only one source is used. Note that this will set +/// IsUnary for shuffles which use a single input multiple times, and in those +/// cases it will adjust the mask to only have indices within that single input. +/// It is an error to call this with non-empty Mask/Ops vectors. 
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, + SmallVectorImpl<SDValue> &Ops, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; + assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); + assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); + IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { @@ -4826,9 +4885,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: + assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); break; + case X86ISD::VSHLDQ: + assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + ImmN = N->getOperand(N->getNumOperands() - 1); + DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::VSRLDQ: + assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + ImmN = N->getOperand(N->getNumOperands() - 1); + DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); @@ -4845,70 +4917,51 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; + case X86ISD::VZEXT_MOVL: + DecodeZeroMoveLowMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::VBROADCAST: { + // We only decode broadcasts of same-sized vectors at the moment. + if (N->getOperand(0).getValueType() == VT) { + DecodeVectorBroadcast(VT, Mask); + IsUnary = true; + break; + } + return false; + } + case X86ISD::VPERMILPV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + unsigned MaskEltSize = VT.getScalarSizeInBits(); + SmallVector<uint64_t, 32> RawMask; + if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { + DecodeVPERMILPMask(VT, RawMask, Mask); + break; + } + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + DecodeVPERMILPMask(C, MaskEltSize, Mask); + break; + } + return false; + } case X86ISD::PSHUFB: { IsUnary = true; SDValue MaskNode = N->getOperand(1); - while (MaskNode->getOpcode() == ISD::BITCAST) - MaskNode = MaskNode->getOperand(0); - - if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { - // If we have a build-vector, then things are easy. - MVT VT = MaskNode.getSimpleValueType(); - assert(VT.isVector() && - "Can't produce a non-vector with a build_vector!"); - if (!VT.isInteger()) - return false; - - int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; - - SmallVector<uint64_t, 32> RawMask; - for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { - SDValue Op = MaskNode->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) { - RawMask.push_back((uint64_t)SM_SentinelUndef); - continue; - } - auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); - if (!CN) - return false; - APInt MaskElement = CN->getAPIntValue(); - - // We now have to decode the element which could be any integer size and - // extract each byte of it. - for (int j = 0; j < NumBytesPerElement; ++j) { - // Note that this is x86 and so always little endian: the low byte is - // the first byte of the mask. 
- RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); - MaskElement = MaskElement.lshr(8); - } - } + SmallVector<uint64_t, 32> RawMask; + if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { DecodePSHUFBMask(RawMask, Mask); break; } - - auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); - if (!MaskLoad) - return false; - - SDValue Ptr = MaskLoad->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) - return false; - - if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { DecodePSHUFBMask(C, Mask); break; } - return false; } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -4937,110 +4990,63 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMIL2: { + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + unsigned MaskEltSize = VT.getScalarSizeInBits(); + SDValue MaskNode = N->getOperand(2); + SDValue CtrlNode = N->getOperand(3); + if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) { + unsigned CtrlImm = CtrlOp->getZExtValue(); + SmallVector<uint64_t, 32> RawMask; + if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { + DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask); + break; + } + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask); + break; + } + } + return false; + } + case X86ISD::VPPERM: { + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + SDValue MaskNode = N->getOperand(2); + SmallVector<uint64_t, 32> RawMask; + if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { + DecodeVPPERMMask(RawMask, Mask); + break; + } + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + DecodeVPPERMMask(C, Mask); + break; + } + return false; + } case X86ISD::VPERMV: { IsUnary = true; + // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. + Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); - while (MaskNode->getOpcode() == ISD::BITCAST) - MaskNode = MaskNode->getOperand(0); - - unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); SmallVector<uint64_t, 32> RawMask; - if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { - // If we have a build-vector, then things are easy. 
- assert(MaskNode.getSimpleValueType().isInteger() && - MaskNode.getSimpleValueType().getVectorNumElements() == - VT.getVectorNumElements()); - - for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { - SDValue Op = MaskNode->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) - RawMask.push_back((uint64_t)SM_SentinelUndef); - else if (isa<ConstantSDNode>(Op)) { - APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); - RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); - } else - return false; - } + unsigned MaskEltSize = VT.getScalarSizeInBits(); + if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { DecodeVPERMVMask(RawMask, Mask); break; } - if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { - unsigned NumEltsInMask = MaskNode->getNumOperands(); - MaskNode = MaskNode->getOperand(0); - if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { - APInt MaskEltValue = CN->getAPIntValue(); - for (unsigned i = 0; i < NumEltsInMask; ++i) - RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); - DecodeVPERMVMask(RawMask, Mask); - break; - } - // It may be a scalar load - } - - auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); - if (!MaskLoad) - return false; - - SDValue Ptr = MaskLoad->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) - return false; - - if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { DecodeVPERMVMask(C, VT, Mask); break; } return false; } case X86ISD::VPERMV3: { - IsUnary = false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); + // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); - while (MaskNode->getOpcode() == ISD::BITCAST) - MaskNode = MaskNode->getOperand(1); - - if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { - // If we have a build-vector, then things are easy. - assert(MaskNode.getSimpleValueType().isInteger() && - MaskNode.getSimpleValueType().getVectorNumElements() == - VT.getVectorNumElements()); - - SmallVector<uint64_t, 32> RawMask; - unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); - - for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { - SDValue Op = MaskNode->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) - RawMask.push_back((uint64_t)SM_SentinelUndef); - else { - auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); - if (!CN) - return false; - APInt MaskElement = CN->getAPIntValue(); - RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); - } - } - DecodeVPERMV3Mask(RawMask, Mask); - break; - } - - auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); - if (!MaskLoad) - return false; - - SDValue Ptr = MaskLoad->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) - return false; - - if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { DecodeVPERMV3Mask(C, VT, Mask); break; } @@ -5055,8 +5061,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, // Check if we're getting a shuffle mask with zero'd elements. 
if (!AllowSentinelZero) - if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) + if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two @@ -5067,6 +5072,123 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, if (M >= (int)Mask.size()) M -= Mask.size(); + // If we didn't already add operands in the opcode-specific code, default to + // adding 1 or 2 operands starting at 0. + if (Ops.empty()) { + Ops.push_back(N->getOperand(0)); + if (!IsUnary || IsFakeUnary) + Ops.push_back(N->getOperand(1)); + } + + return true; +} + +/// Check a target shuffle mask's inputs to see if we can set any values to +/// SM_SentinelZero - this is for elements that are known to be zero +/// (not just zeroable) from their inputs. +/// Returns true if the target shuffle mask was decoded. +static bool setTargetShuffleZeroElements(SDValue N, + SmallVectorImpl<int> &Mask, + SmallVectorImpl<SDValue> &Ops) { + bool IsUnary; + if (!isTargetShuffle(N.getOpcode())) + return false; + if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops, + Mask, IsUnary)) + return false; + + SDValue V1 = Ops[0]; + SDValue V2 = IsUnary ? V1 : Ops[1]; + + V1 = peekThroughBitcasts(V1); + V2 = peekThroughBitcasts(V2); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + + // Already decoded as SM_SentinelZero / SM_SentinelUndef. + if (M < 0) + continue; + + // Determine shuffle input and normalize the mask. + SDValue V = M < Size ? V1 : V2; + M %= Size; + + // We are referencing an UNDEF input. + if (V.isUndef()) { + Mask[i] = SM_SentinelUndef; + continue; + } + + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + // If the BUILD_VECTOR has fewer elements then the (larger) source + // element must be UNDEF/ZERO. + // TODO: Is it worth testing the individual bits of a constant? + if ((Size % V.getNumOperands()) == 0) { + int Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef()) + Mask[i] = SM_SentinelUndef; + else if (X86::isZeroNode(Op)) + Mask[i] = SM_SentinelZero; + continue; + } + + // If the BUILD_VECTOR has more elements then all the (smaller) source + // elements must be all UNDEF or all ZERO. + if ((V.getNumOperands() % Size) == 0) { + int Scale = V->getNumOperands() / Size; + bool AllUndef = true; + bool AllZero = true; + for (int j = 0; j < Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllUndef &= Op.isUndef(); + AllZero &= X86::isZeroNode(Op); + } + if (AllUndef) + Mask[i] = SM_SentinelUndef; + else if (AllZero) + Mask[i] = SM_SentinelZero; + continue; + } + } + + return true; +} + +/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs +/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the +/// remaining input indices in case we now have a unary shuffle and adjust the +/// Op0/Op1 inputs accordingly. +/// Returns true if the target shuffle mask was decoded. 
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, + SmallVectorImpl<int> &Mask) { + SmallVector<SDValue, 2> Ops; + if (!setTargetShuffleZeroElements(Op, Mask, Ops)) + return false; + + int NumElts = Mask.size(); + bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { + return 0 <= Idx && Idx < NumElts; + }); + bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), + [NumElts](int Idx) { return NumElts <= Idx; }); + + Op0 = Op0InUse ? Ops[0] : SDValue(); + Op1 = Op1InUse ? Ops[1] : SDValue(); + + // We're only using Op1 - commute the mask and inputs. + if (!Op0InUse && Op1InUse) { + for (int &M : Mask) + if (NumElts <= M) + M -= NumElts; + Op0 = Op1; + Op1 = SDValue(); + } + return true; } @@ -5097,19 +5219,24 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); + MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; + SmallVector<SDValue, 16> ShuffleOps; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; + if (Elt == SM_SentinelZero) + return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) + : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); if (Elt == SM_SentinelUndef) - return DAG.getUNDEF(ShufVT.getVectorElementType()); + return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); - SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); + SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -5138,7 +5265,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, - const X86Subtarget* Subtarget, + const X86Subtarget &Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -5148,7 +5275,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, bool First = true; // SSE4.1 - use PINSRB to insert each byte directly. - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { @@ -5208,7 +5335,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, - const X86Subtarget* Subtarget, + const X86Subtarget &Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -5237,13 +5364,13 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. 
std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); - Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); @@ -5296,12 +5423,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, // Let the shuffle legalizer deal with blend operations. SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) - V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + V1 = DAG.getBitcast(VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); } // See if we can lower this build_vector to a INSERTPS. - if (!Subtarget->hasSSE41()) + if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); @@ -5326,9 +5453,9 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) - V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) - V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); + V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); @@ -5342,11 +5469,11 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, } /// Return a vector logical shift node. -static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, - unsigned NumBits, SelectionDAG &DAG, - const TargetLowering &TLI, SDLoc dl) { +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, + SelectionDAG &DAG, const TargetLowering &TLI, + const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); - MVT ShVT = MVT::v2i64; + MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); @@ -5355,8 +5482,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } -static SDValue -LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { +static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, + SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into @@ -5418,12 +5545,11 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, - LD->getPointerInfo().getWithOffset(StartOffset), - false, false, false, 0); + LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector<int, 8> Mask(NumElems, EltNo); - return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); @@ -5433,55 +5559,103 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. 
/// -/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a -/// -/// FIXME: we'd also like to handle the case where the last elements are zero -/// rather than undef via VZEXT_LOAD, but we do not detect that case today. -/// There's even a handy isZeroNode for that purpose. +/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { unsigned NumElems = Elts.size(); - LoadSDNode *LDBase = nullptr; - unsigned LastLoadedElt = -1U; + int LastLoadedElt = -1; + SmallBitVector LoadMask(NumElems, false); + SmallBitVector ZeroMask(NumElems, false); + SmallBitVector UndefMask(NumElems, false); - // For each element in the initializer, see if we've found a load or an undef. - // If we don't find an initial load element, or later load elements are - // non-consecutive, bail out. + // For each element in the initializer, see if we've found a load, zero or an + // undef. for (unsigned i = 0; i < NumElems; ++i) { - SDValue Elt = Elts[i]; - // Look through a bitcast. - if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) - Elt = Elt.getOperand(0); - if (!Elt.getNode() || - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + SDValue Elt = peekThroughBitcasts(Elts[i]); + if (!Elt.getNode()) return SDValue(); - if (!LDBase) { - if (Elt.getNode()->getOpcode() == ISD::UNDEF) - return SDValue(); - LDBase = cast<LoadSDNode>(Elt.getNode()); - LastLoadedElt = i; - continue; - } - if (Elt.getOpcode() == ISD::UNDEF) - continue; - LoadSDNode *LD = cast<LoadSDNode>(Elt); - EVT LdVT = Elt.getValueType(); - // Each loaded element must be the correct fractional portion of the - // requested vector load. - if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) - return SDValue(); - if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) + if (Elt.isUndef()) + UndefMask[i] = true; + else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) + ZeroMask[i] = true; + else if (ISD::isNON_EXTLoad(Elt.getNode())) { + LoadMask[i] = true; + LastLoadedElt = i; + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + return SDValue(); + } else return SDValue(); - LastLoadedElt = i; } + assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && + "Incomplete element masks"); + + // Handle Special Cases - all undef or undef/zero. + if (UndefMask.count() == NumElems) + return DAG.getUNDEF(VT); + + // FIXME: Should we return this as a BUILD_VECTOR instead? + if ((ZeroMask | UndefMask).count() == NumElems) + return VT.isInteger() ? DAG.getConstant(0, DL, VT) + : DAG.getConstantFP(0.0, DL, VT); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + int FirstLoadedElt = LoadMask.find_first(); + SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); + LoadSDNode *LDBase = cast<LoadSDNode>(EltBase); + EVT LDBaseVT = EltBase.getValueType(); + + // Consecutive loads can contain UNDEFS but not ZERO elements. + // Consecutive loads with UNDEFs and ZEROs elements require a + // an additional shuffle stage to clear the ZERO elements. 
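The consecutive-load check that follows is, at heart, address arithmetic: loaded lane i must sit at BaseAddr + i * element size, and lanes that are not loads are simply not checked here (zero lanes are what the "with zeros" variant tolerates). A minimal model with plain integers standing in for load addresses (illustrative only):

#include <cstdint>
#include <optional>
#include <vector>

// std::nullopt models a lane that is not a load (undef or zero) and is
// therefore not part of the address check.
bool loadsAreConsecutive(const std::vector<std::optional<uint64_t>> &Addrs,
                         uint64_t BaseAddr, uint64_t EltSizeInBytes) {
  for (size_t i = 0; i < Addrs.size(); ++i)
    if (Addrs[i] && *Addrs[i] != BaseAddr + i * EltSizeInBytes)
      return false;
  return true;
}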
+ bool IsConsecutiveLoad = true; + bool IsConsecutiveLoadWithZeros = true; + for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { + if (LoadMask[i]) { + SDValue Elt = peekThroughBitcasts(Elts[i]); + LoadSDNode *LD = cast<LoadSDNode>(Elt); + if (!DAG.areNonVolatileConsecutiveLoads( + LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, + i - FirstLoadedElt)) { + IsConsecutiveLoad = false; + IsConsecutiveLoadWithZeros = false; + break; + } + } else if (ZeroMask[i]) { + IsConsecutiveLoad = false; + } + } + + auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) { + auto MMOFlags = LDBase->getMemOperand()->getFlags(); + assert(!(MMOFlags & MachineMemOperand::MOVolatile) && + "Cannot merge volatile loads."); + SDValue NewLd = + DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = + DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; + }; + // LOAD - all consecutive load/undefs (must start/end with a load). // If we have found an entire vector of loads and undefs, then return a large - // load of the entire vector width starting at the base pointer. If we found - // consecutive loads for the low half, generate a vzext_load node. - if (LastLoadedElt == NumElems - 1) { + // load of the entire vector width starting at the base pointer. + // If the vector contains zeros, then attempt to shuffle those elements. + if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && + (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); // Ensure that the input vector size for the merged loads matches the @@ -5489,72 +5663,93 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) return SDValue(); - if (isAfterLegalize && - !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) + if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); - SDValue NewLd = SDValue(); + if (IsConsecutiveLoad) + return CreateLoad(VT, LDBase); + + // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded + // vector and a zero vector to clear out the zero elements. + if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { + SmallVector<int, 4> ClearMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + if (ZeroMask[i]) + ClearMask[i] = i + NumElems; + else if (LoadMask[i]) + ClearMask[i] = i; + } + SDValue V = CreateLoad(VT, LDBase); + SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) + : DAG.getConstantFP(0.0, DL, VT); + return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); + } + } + + int LoadSize = + (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); + + // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs. + if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 && + ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { + MVT VecSVT = VT.isFloatingPoint() ? 
MVT::f64 : MVT::i64; + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64); + if (TLI.isTypeLegal(VecVT)) { + SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); - NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), LDBase->isVolatile(), - LDBase->isNonTemporal(), LDBase->isInvariant(), - LDBase->getAlignment()); + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, + // and update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = + DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } - if (LDBase->hasAnyUseOfValue(1)) { - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - SDValue(LDBase, 1), - SDValue(NewLd.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), - SDValue(NewLd.getNode(), 1)); + return DAG.getBitcast(VT, ResNode); } - - return NewLd; } - //TODO: The code below fires only for for loading the low v2i32 / v2f32 - //of a v4i32 / v4f32. It's probably worth generalizing. - EVT EltVT = VT.getVectorElementType(); - if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && - DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); - SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, - LDBase->getPointerInfo(), - LDBase->getAlignment(), - false/*isVolatile*/, true/*ReadMem*/, - false/*WriteMem*/); - - // Make sure the newly-created LOAD is in the same position as LDBase in - // terms of dependency. We create a TokenFactor for LDBase and ResNode, and - // update uses of LDBase's output chain to use the TokenFactor. - if (LDBase->hasAnyUseOfValue(1)) { - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), - SDValue(ResNode.getNode(), 1)); + // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs. + if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 && + ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { + MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32; + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32); + if (TLI.isTypeLegal(VecVT)) { + SDValue V = LastLoadedElt != 0 ? 
CreateLoad(VecSVT, LDBase) + : DAG.getBitcast(VecSVT, EltBase); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V); + return DAG.getBitcast(VT, V); } - - return DAG.getBitcast(VT, ResNode); } + return SDValue(); } -/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction -/// to generate a splat value for the following cases: +/// Attempt to use the vbroadcast instruction to generate a splat value for the +/// following cases: /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, +static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. - if (!Subtarget->hasAVX()) + if (!Subtarget.hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); @@ -5604,12 +5799,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { - if (!Subtarget->hasInt256()) + if (!Subtarget.hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. if (VT.getSizeInBits() >= 256) - Sc = Extract128BitVector(Sc, 0, DAG, dl); + Sc = extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } @@ -5622,7 +5817,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // Constants may have multiple users. // AVX-512 has register version of the broadcast - bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() && Ld.getValueType().getSizeInBits() >= 32; if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && !hasRegVer)) @@ -5647,7 +5842,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. - if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { + if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); @@ -5656,7 +5851,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
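At the source level, the VZEXT_LOAD and VZEXT_MOVL patterns formed above correspond to the movq / movd / movss forms that load or insert one small value and zero every remaining lane. Illustrative SSE intrinsics, not the generated code:

#include <emmintrin.h>
#include <xmmintrin.h>

// movq: load the low 64 bits, zero the upper 64 bits of the register.
__m128i load_low64_zero_rest(const void *p) {
  return _mm_loadl_epi64(static_cast<const __m128i *>(p));
}

// movd / movss: one 32-bit element in lane 0, all other lanes zero.
__m128i scalar_i32_to_vec(int x)          { return _mm_cvtsi32_si128(x); }
__m128  scalar_f32_to_vec(const float *p) { return _mm_load_ss(p); }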
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || - (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { + (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5671,8 +5866,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, - false, false, Alignment); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5681,7 +5876,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. - if (!IsLoad && Subtarget->hasInt256() && + if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); @@ -5690,12 +5885,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || - (Subtarget->hasVLX() && ScalarSize == 64)) + (Subtarget.hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm - if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { + if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5801,7 +5996,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return SDValue(); VecIn2 = VecIn2.getNode() ? 
VecIn2 : DAG.getUNDEF(VT); - SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), @@ -5818,7 +6013,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); - if (In.getOpcode() != ISD::UNDEF) + if (!In.isUndef()) Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; } SDLoc dl(Op); @@ -5835,17 +6030,11 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); - if (ISD::isBuildVectorAllZeros(Op.getNode())) { - SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } + if (ISD::isBuildVectorAllZeros(Op.getNode())) + return DAG.getTargetConstant(0, dl, VT); - if (ISD::isBuildVectorAllOnes(Op.getNode())) { - SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } + if (ISD::isBuildVectorAllOnes(Op.getNode())) + return DAG.getTargetConstant(1, dl, VT); if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { SDValue Imm = ConvertI1VectorToInteger(Op, DAG); @@ -5864,7 +6053,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); - if (In.getOpcode() == ISD::UNDEF) + if (In.isUndef()) continue; if (!isa<ConstantSDNode>(In)) NonConstIdx.push_back(idx); @@ -5872,7 +6061,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; HasConstElts = true; } - if (SplatIdx == -1) + if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; @@ -5903,7 +6092,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0, dl)); } - for (unsigned i = 0; i < NonConstIdx.size(); ++i) { + for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), @@ -5948,7 +6137,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. - if (Op->getOpcode() == ISD::UNDEF) { + if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; @@ -5978,13 +6167,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { - if (V0.getOpcode() == ISD::UNDEF) { + if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { - if (V1.getOpcode() == ISD::UNDEF) { + if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; @@ -6041,37 +6230,35 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// 128-bits of the result. 
If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, - SDLoc DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { - EVT VT = V0.getValueType(); - assert(VT.is256BitVector() && VT == V1.getValueType() && + MVT VT = V0.getSimpleValueType(); + assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); - SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); - SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); - SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); - SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); - EVT NewVT = V0_LO.getValueType(); + SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); + MVT NewVT = V0_LO.getSimpleValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. - if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + if (!isUndefLO && !V0->isUndef()) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); - if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + if (!isUndefHI && !V1->isUndef()) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. - if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || - V1_LO->getOpcode() != ISD::UNDEF)) + if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); - if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || - V1_HI->getOpcode() != ISD::UNDEF)) + if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } @@ -6081,10 +6268,10 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); - if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); SDLoc DL(BV); @@ -6142,12 +6329,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, SubFound = true; // Update InVec0 and InVec1. - if (InVec0.getOpcode() == ISD::UNDEF) { + if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return SDValue(); } - if (InVec1.getOpcode() == ISD::UNDEF) { + if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return SDValue(); @@ -6174,8 +6361,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. 
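For reference, the 'addsub' pattern LowerToAddSub is matching above follows addsubps semantics: even lanes subtract, odd lanes add. A scalar model (illustrative only):

#include <array>

std::array<float, 4> addsub(const std::array<float, 4> &A,
                            const std::array<float, 4> &B) {
  std::array<float, 4> R;
  for (int i = 0; i < 4; ++i)
    R[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  return R;
}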
- if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && - InVec1.getOpcode() != ISD::UNDEF) + if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef()) return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); return SDValue(); @@ -6183,7 +6369,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -6193,11 +6379,11 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, // Count the number of UNDEF operands in the build_vector in input. for (unsigned i = 0, e = Half; i != e; ++i) - if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) - if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + if (BV->getOperand(i)->isUndef()) NumUndefsHI++; // Early exit if this is either a build_vector of all UNDEFs or all the @@ -6207,14 +6393,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, SDLoc DL(BV); SDValue InVec0, InVec1; - if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); - } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. 
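The horizontal-add pattern being matched here reduces to pairwise adds: for v4f32, adjacent pairs of the first input feed the low half of the result and adjacent pairs of the second input feed the high half (haddps semantics). Scalar model, illustrative only:

#include <array>

std::array<float, 4> hadd(const std::array<float, 4> &A,
                          const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}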
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); @@ -6223,7 +6409,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } - if (!Subtarget->hasAVX()) + if (!Subtarget.hasAVX()) return SDValue(); if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { @@ -6232,18 +6418,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, SDValue InVec2, InVec3; if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && - ((InVec0.getOpcode() == ISD::UNDEF || - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && - ((InVec1.getOpcode() == ISD::UNDEF || - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && + ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && - ((InVec0.getOpcode() == ISD::UNDEF || - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && - ((InVec1.getOpcode() == ISD::UNDEF || - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && + ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. @@ -6253,17 +6435,13 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && - ((InVec0.getOpcode() == ISD::UNDEF || - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && - ((InVec1.getOpcode() == ISD::UNDEF || - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && + ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && - ((InVec0.getOpcode() == ISD::UNDEF || - InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && - ((InVec1.getOpcode() == ISD::UNDEF || - InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && + ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; @@ -6271,7 +6449,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, if (CanFold) { // Fold this build_vector into a single horizontal add/sub. // Do this only if the target has AVX2. 
- if (Subtarget->hasAVX2()) + if (Subtarget.hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); // Do not try to expand this build_vector into a pair of horizontal @@ -6289,7 +6467,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || - VT == MVT::v16i16) && Subtarget->hasAVX()) { + VT == MVT::v16i16) && Subtarget.hasAVX()) { unsigned X86Opcode; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; @@ -6318,39 +6496,101 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, return SDValue(); } -SDValue -X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - +/// If a BUILD_VECTOR's source elements all apply the same bit operation and +/// one of their operands is constant, lower to a pair of BUILD_VECTOR and +/// just apply the bit to the vectors. +/// NOTE: Its not in our interest to start make a general purpose vectorizer +/// from this, but enough scalar bit operations are created from the later +/// legalization + scalarization stages to need basic support. +static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - MVT ExtVT = VT.getVectorElementType(); - unsigned NumElems = Op.getNumOperands(); + unsigned NumElems = VT.getVectorNumElements(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // Generate vectors for predicate vectors. - if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) - return LowerBUILD_VECTORvXi1(Op, DAG); + // Check that all elements have the same opcode. + // TODO: Should we allow UNDEFS and if so how many? + unsigned Opcode = Op.getOperand(0).getOpcode(); + for (unsigned i = 1; i < NumElems; ++i) + if (Opcode != Op.getOperand(i).getOpcode()) + return SDValue(); - // Vectors containing all zeros can be matched by pxor and xorps later + // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). + switch (Opcode) { + default: + return SDValue(); + case ISD::AND: + case ISD::XOR: + case ISD::OR: + if (!TLI.isOperationLegalOrPromote(Opcode, VT)) + return SDValue(); + break; + } + + SmallVector<SDValue, 4> LHSElts, RHSElts; + for (SDValue Elt : Op->ops()) { + SDValue LHS = Elt.getOperand(0); + SDValue RHS = Elt.getOperand(1); + + // We expect the canonicalized RHS operand to be the constant. + if (!isa<ConstantSDNode>(RHS)) + return SDValue(); + LHSElts.push_back(LHS); + RHSElts.push_back(RHS); + } + + SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); + SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); + return DAG.getNode(Opcode, DL, VT, LHS, RHS); +} + +/// Create a vector constant without a load. SSE/AVX provide the bare minimum +/// functionality to do this, so it's all zeros, all ones, or some derivation +/// that is cheap to calculate. +static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + // Vectors containing all zeros can be matched by pxor and xorps. if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 
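lowerBuildVectorToBitOp above rewrites a build_vector whose lanes all apply the same bitwise op with constant right-hand sides into two build_vectors and a single vector op. At the intrinsics level, the "after" shape looks roughly like this (the constants are invented for illustration):

#include <emmintrin.h>

// Before (conceptually): four scalar ANDs, one per lane, fed into a
// build_vector. After: two build_vectors and a single vector AND.
__m128i and_lanes(int x0, int x1, int x2, int x3) {
  __m128i LHS = _mm_set_epi32(x3, x2, x1, x0);         // build_vector of the xs
  __m128i RHS = _mm_set_epi32(0x08, 0x04, 0x02, 0x01); // build_vector of consts
  return _mm_and_si128(LHS, RHS);                      // one vector AND
}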
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; - return getZeroVector(VT, Subtarget, DAG, dl); + return getZeroVector(VT, Subtarget, DAG, DL); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. - if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) + if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { + if (VT == MVT::v4i32 || VT == MVT::v16i32 || + (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; - if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget, DAG, dl); + return getOnesVector(VT, Subtarget, DAG, DL); } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + + MVT VT = Op.getSimpleValueType(); + MVT ExtVT = VT.getVectorElementType(); + unsigned NumElems = Op.getNumOperands(); + + // Generate vectors for predicate vectors. + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + + if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) + return VectorConstant; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) return AddSub; @@ -6358,6 +6598,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return HorizontalOp; if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; + if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG)) + return BitOp; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -6368,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallSet<SDValue, 8> Values; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() == ISD::UNDEF) + if (Elt.isUndef()) continue; Values.insert(Elt); if (Elt.getOpcode() != ISD::Constant && @@ -6397,7 +6639,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // insertion that way. Only do this if the value is non-constant or if the // value is a constant being inserted into element 0. It is cheaper to do // a constant pool load than it is to do a movd + shuffle. - if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && + if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. @@ -6422,7 +6664,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || - (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + (ExtVT == MVT::i64 && Subtarget.is64Bit())) { if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, @@ -6439,16 +6681,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // it to i32 first. 
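materializeVectorConstant above relies on all-zeros and all-ones being cheap to synthesize without a load. At the source level these typically come out as pxor / pcmpeqd; illustrative only, actual codegen may vary:

#include <emmintrin.h>

__m128i zeros() { return _mm_setzero_si128(); }   // typically a single pxor
__m128i ones()  { return _mm_set1_epi32(-1); }    // typically pcmpeqd xmm, xmm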
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - if (VT.is256BitVector()) { - if (Subtarget->hasAVX()) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); + if (VT.getSizeInBits() >= 256) { + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + if (Subtarget.hasAVX()) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); - Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); + Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); @@ -6504,28 +6747,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - // For AVX-length vectors, see if we can use a vector load to get all of the - // elements, otherwise build the individual 128-bit pieces and use + // See if we can use a vector load to get all of the elements. + if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { + SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); + if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false)) + return LD; + } + + // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { - SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); - - // Check for a build vector of consecutive loads. - if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) - return LD; + SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. - SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, - makeArrayRef(&V[0], NumElems/2)); - SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, - makeArrayRef(&V[NumElems / 2], NumElems/2)); + SDValue Lower = + DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2)); + SDValue Upper = DAG.getBuildVector( + HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) - return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); - return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -6557,30 +6802,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return V; // If element VT is == 32 bits, turn it into a number of shuffles. 
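The wide build_vector path above assembles each 128-bit half separately and then recreates the full vector. The same shape with AVX intrinsics (a sketch, not the emitted nodes):

#include <immintrin.h>

__m256 concat_halves(__m128 Lo, __m128 Hi) {
  __m256 V = _mm256_castps128_ps256(Lo);   // low half in place, high half undefined
  return _mm256_insertf128_ps(V, Hi, 1);   // write the high half
}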
- SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { + SmallVector<SDValue, 8> Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget, DAG, dl); + Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { default: break; case 0: - V[i] = V[i*2]; // Must be a zero vector. + Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: - V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); + Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: - V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); + Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: - V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); + Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } @@ -6593,32 +6838,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { static_cast<int>(Reverse2 ? NumElems+1 : NumElems), static_cast<int>(Reverse2 ? NumElems : NumElems+1) }; - return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); + return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } if (Values.size() > 1 && VT.is128BitVector()) { - // Check for a build vector of consecutive loads. - for (unsigned i = 0; i < NumElems; ++i) - V[i] = Op.getOperand(i); - - // Check for elements which are consecutive loads. - if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) - return LD; - // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { SDValue Result; - if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { - if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } @@ -6628,11 +6865,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). + SmallVector<SDValue, 8> Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { - if (Op.getOperand(i).getOpcode() != ISD::UNDEF) - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + if (!Op.getOperand(i).isUndef()) + Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else - V[i] = DAG.getUNDEF(VT); + Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. 
for v4f32: @@ -6642,20 +6880,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned EltStride = NumElems >> 1; while (EltStride != 0) { for (unsigned i = 0; i < EltStride; ++i) { - // If V[i+EltStride] is undef and this is the first round of mixing, + // If Ops[i+EltStride] is undef and this is the first round of mixing, // then it is safe to just drop this shuffle: V[i] is already in the // right place, the one element (since it's the first round) being // inserted as undef can be dropped. This isn't safe for successive // rounds because they will permute elements within both vectors. - if (V[i+EltStride].getOpcode() == ISD::UNDEF && + if (Ops[i+EltStride].isUndef() && EltStride == NumElems/2) continue; - V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]); } EltStride >>= 1; } - return V[0]; + return Ops[0]; } return SDValue(); } @@ -6673,21 +6911,23 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if (ResVT.is256BitVector()) - return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); - return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), - Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); + return concat256BitVectors( + concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl), + concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT, + NumElems, DAG, dl); } - return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); @@ -6764,7 +7004,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, } static SDValue LowerCONCAT_VECTORS(SDValue Op, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) @@ -6800,24 +7040,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef<int> Mask) { - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] != -1 && Mask[i] != i) - return false; - return true; -} - -/// \brief Helper function to classify a mask as a single-input mask. -/// -/// This isn't a generic single-input test because in the vector shuffle -/// lowering we canonicalize single inputs to be the first input operand. This -/// means we can more quickly test for a single input by only checking whether -/// an input from the second operand exists. We also assume that the size of -/// mask corresponds to the size of the input vectors which isn't true in the -/// fully general case. 
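The iterative unpack mixing described above, written out for v4f32 with SSE intrinsics (a sketch of the same dataflow, not the generated code):

#include <xmmintrin.h>

__m128 build_v4f32(float e0, float e1, float e2, float e3) {
  __m128 V0 = _mm_set_ss(e0), V1 = _mm_set_ss(e1);
  __m128 V2 = _mm_set_ss(e2), V3 = _mm_set_ss(e3);
  V0 = _mm_unpacklo_ps(V0, V2);    // <e0, e2, 0, 0>
  V1 = _mm_unpacklo_ps(V1, V3);    // <e1, e3, 0, 0>
  return _mm_unpacklo_ps(V0, V1);  // <e0, e1, e2, e3>
}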
-static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { - for (int M : Mask) - if (M >= (int)Mask.size()) + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + assert(Mask[i] >= -1 && "Out of bound mask element!"); + if (Mask[i] >= 0 && Mask[i] != i) return false; + } return true; } @@ -6835,22 +7062,22 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { return false; } -/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// \brief Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same -/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is -/// *not* suitable for use with existing 128-bit shuffles as it will contain -/// entries from both V1 and V2 inputs to the wider mask. -static bool -is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, - SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); - RepeatedMask.resize(LaneSize, -1); +/// suitable for use with existing 128-bit shuffles as entries from the second +/// vector have been remapped to [LaneSize, 2*LaneSize). +static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, + ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); + RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) @@ -6860,17 +7087,55 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. - if (RepeatedMask[i % LaneSize] == -1) + // Adjust second vector indices to start at LaneSize instead of Size. + int LocalM = Mask[i] < Size ? Mask[i] % LaneSize + : Mask[i] % LaneSize + LaneSize; + if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. - RepeatedMask[i % LaneSize] = - Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; - else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + RepeatedMask[i % LaneSize] = LocalM; + else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } +/// Test whether a shuffle mask is equivalent within each 128-bit lane. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); +} + +/// Test whether a shuffle mask is equivalent within each 256-bit lane. +static bool +is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); +} + +static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, + SmallVectorImpl<int> &ScaledMask) { + assert(0 < Scale && "Unexpected scaling factor"); + int NumElts = Mask.size(); + ScaledMask.assign(NumElts * Scale, -1); + + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + + // Repeat sentinel values in every mask element. 
+ if (M < 0) { + for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = M; + continue; + } + + // Scale mask element and increment across each mask element. + for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = (Scale * M) + s; + } +} + /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// @@ -6893,8 +7158,9 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); - for (int i = 0; i < Size; ++i) - if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { + for (int i = 0; i < Size; ++i) { + assert(Mask[i] >= -1 && "Out of bound mask element!"); + if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || @@ -6902,6 +7168,32 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } +} + + return true; +} + +/// Checks whether a target shuffle mask is equivalent to an explicit pattern. +/// +/// The masks must be exactly the same width. +/// +/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding +/// value in ExpectedMask is always accepted. Otherwise the indices must match. +/// +/// SM_SentinelZero is accepted as a valid negative index but must match in both. +static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, + ArrayRef<int> ExpectedMask) { + int Size = Mask.size(); + if (Size != (int)ExpectedMask.size()) + return false; + + for (int i = 0; i < Size; ++i) + if (Mask[i] == SM_SentinelUndef) + continue; + else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) + return false; + else if (Mask[i] != ExpectedMask[i]) + return false; return true; } @@ -6914,8 +7206,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, - SelectionDAG &DAG) { +static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); @@ -6923,11 +7214,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; - Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; - Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; - Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; - Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; - return DAG.getConstant(Imm, DL, MVT::i8); + Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; + return Imm; +} + +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, + SelectionDAG &DAG) { + return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// \brief Compute whether each element of a shuffle is zeroable. 
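Two of the helpers in this hunk are easy to restate outside SelectionDAG. The following standalone functions mirror scaleShuffleMask and getV4X86ShuffleImm over plain arrays (names are mine; illustrative only):

#include <cassert>
#include <vector>

// scaleShuffleMask, restated: every element expands into Scale consecutive
// entries; sentinels (negative values) are simply repeated.
std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  assert(Scale > 0 && "Unexpected scaling factor");
  std::vector<int> Scaled(Mask.size() * Scale, -1);
  for (size_t i = 0; i != Mask.size(); ++i)
    for (int s = 0; s != Scale; ++s)
      Scaled[Scale * i + s] = Mask[i] < 0 ? Mask[i] : Scale * Mask[i] + s;
  return Scaled;
}
// scaleMask(2, {1, -1, 0, 3}) == {2, 3, -1, -1, 0, 1, 6, 7}

// getV4X86ShuffleImm, restated: two bits per lane, undef (-1) lanes keep
// their identity index. {3, 2, 1, 0} encodes as 0x1B, the classic reverse.
unsigned v4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of bound mask element!");
    Imm |= unsigned(Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  }
  return Imm;
}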
@@ -6941,15 +7237,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); + V1 = peekThroughBitcasts(V1); + V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + int VectorSizeInBits = V1.getValueType().getSizeInBits(); + int ScalarSizeInBits = VectorSizeInBits / Mask.size(); + assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); + for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. @@ -6958,38 +7255,119 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, continue; } - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. + // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + M %= Size; + + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) continue; - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; + // If the BUILD_VECTOR has fewer elements then the bitcasted portion of + // the (larger) source element must be UNDEF/ZERO. + if ((Size % V.getNumOperands()) == 0) { + int Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef() || X86::isZeroNode(Op)) + Zeroable[i] = true; + else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + APInt Val = Cst->getAPIntValue(); + Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val = Val.getLoBits(ScalarSizeInBits); + Zeroable[i] = (Val == 0); + } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { + APInt Val = Cst->getValueAPF().bitcastToAPInt(); + Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val = Val.getLoBits(ScalarSizeInBits); + Zeroable[i] = (Val == 0); + } + continue; + } + + // If the BUILD_VECTOR has more elements then all the (smaller) source + // elements must be UNDEF or ZERO. + if ((V.getNumOperands() % Size) == 0) { + int Scale = V->getNumOperands() / Size; + bool AllZeroable = true; + for (int j = 0; j < Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); + } + Zeroable[i] = AllZeroable; + continue; + } } return Zeroable; } +/// Try to lower a shuffle with a single PSHUFB of V1. +/// This is only possible if V2 is unused (at all, or only for zero elements). 
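The constant-splitting step added to computeZeroableShuffleElements above boils down to checking whether the relevant slice of a wider constant is zero. A minimal sketch with uint64_t standing in for APInt (little-endian lane order, as in the code above):

#include <cstdint>

// Wide constant viewed as Scale narrow elements of EltBits each: narrow
// element M is zeroable iff its bits are all zero.
bool narrowEltIsZero(uint64_t WideConst, int Scale, int M, int EltBits) {
  uint64_t Shifted = WideConst >> ((M % Scale) * EltBits);
  uint64_t Low = (EltBits >= 64) ? Shifted
                                 : (Shifted & ((uint64_t(1) << EltBits) - 1));
  return Low == 0;
}
// A v2i64 constant lane 0x0000000100000000 seen as two i32s: element 0 is
// zeroable, element 1 is not.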
+static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + const int NumBytes = VT.getSizeInBits() / 8; + const int NumEltBytes = VT.getScalarSizeInBits() / 8; + + assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || + (Subtarget.hasAVX2() && VT.is256BitVector()) || + (Subtarget.hasBWI() && VT.is512BitVector())); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + SmallVector<SDValue, 64> PSHUFBMask(NumBytes); + // Sign bit set in i8 mask means zero element. + SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); + + for (int i = 0; i < NumBytes; ++i) { + int M = Mask[i / NumEltBytes]; + if (M < 0) { + PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); + continue; + } + if (Zeroable[i / NumEltBytes]) { + PSHUFBMask[i] = ZeroMask; + continue; + } + // Only allow V1. + if (M >= Size) + return SDValue(); + + // PSHUFB can't cross lanes, ensure this doesn't happen. + if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) + return SDValue(); + + M = M % LaneSize; + M = M * NumEltBytes + (i % NumEltBytes); + PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); + } + + MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1), + DAG.getBuildVector(I8VT, DL, PSHUFBMask))); +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. -static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - SmallVector<int, 8> Unpckl; - SmallVector<int, 8> Unpckh; + SmallVector<int, 8> Unpckl(NumElts); + SmallVector<int, 8> Unpckh(NumElts); for (int i = 0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); int HiPos = LoPos + NumEltsInLane / 2; - Unpckl.push_back(LoPos); - Unpckh.push_back(HiPos); + Unpckl[i] = LoPos; + Unpckh[i] = HiPos; } if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) @@ -7013,7 +7391,7 @@ static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { MVT EltVT = VT.getVectorElementType(); @@ -7044,7 +7422,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, if (!V) return SDValue(); // No non-zeroable elements! - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); V = DAG.getNode(VT.isFloatingPoint() ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, DL, VT, V, VMask); @@ -7056,7 +7434,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, /// This is used as a fallback approach when first class blend instructions are /// unavailable. 
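A standalone model of the PSHUFB control vector built above, restricted to the unary 128-bit case; the real lowering also distinguishes zeroable from undef lanes and handles 256/512-bit widths lane by lane (illustrative only):

#include <cstdint>
#include <vector>

// Mask has 16 / EltSizeInBytes entries, each in [0, NumElts) or negative for
// an undef/zero lane. A control byte of 0x80 makes pshufb write zero.
std::vector<uint8_t> buildPshufbControl(const std::vector<int> &Mask,
                                        int EltSizeInBytes) {
  std::vector<uint8_t> Bytes(16, 0x80);
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / EltSizeInBytes];
    if (M < 0)
      continue;                        // undef/zero lane stays 0x80
    Bytes[i] = uint8_t(M * EltSizeInBytes + (i % EltSizeInBytes));
  }
  return Bytes;
}
// For a v4i32 mask {2, 0, -1, 3}: {8,9,10,11, 0,1,2,3, 0x80 x4, 12,13,14,15}.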
Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. -static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); @@ -7067,12 +7445,12 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, EltVT); SmallVector<SDValue, 16> MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) + if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } - SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); + SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); // We have to cast V2 around. MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); @@ -7088,9 +7466,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. -static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Original, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); @@ -7153,13 +7531,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, case MVT::v4i64: case MVT::v8i32: - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); // FALLTHROUGH case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. - if (Subtarget->hasAVX2()) { + if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); @@ -7184,14 +7562,14 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, } case MVT::v16i16: { - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) - if (RepeatedMask[i] >= 16) + if (RepeatedMask[i] >= 8) BlendMask |= 1u << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); @@ -7200,7 +7578,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { - assert((VT.is128BitVector() || Subtarget->hasAVX2()) && + assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); // Attempt to lower to a bitmask if we can. 
VPAND is faster than VPBLENDVB. @@ -7235,10 +7613,9 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getNode(ISD::BUILD_VECTOR, DL, - BlendVT, VSELECTMask), - V1, V2)); + return DAG.getBitcast( + VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } default: @@ -7251,8 +7628,8 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. -static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, +static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { // We build up the blend mask while checking whether a blend is a viable way @@ -7266,7 +7643,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); - if (BlendMask[Mask[i] % Size] == -1) + if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! @@ -7285,8 +7662,8 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, - SDValue V1, +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, + MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { @@ -7335,10 +7712,10 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, +static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); @@ -7357,9 +7734,8 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue Lo, Hi; for (int l = 0; l < NumElts; l += NumLaneElts) { for (int i = 0; i < NumLaneElts; ++i) { - if (Mask[l + i] == -1) + if (Mask[l + i] < 0) continue; - assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); // Get the mod-Size index and lane correct it. int LaneIdx = (Mask[l + i] % NumElts) - l; @@ -7411,19 +7787,22 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, else if (!Hi) Hi = Lo; + // Cast the inputs to i8 vector of correct length to match PALIGNR or + // PSLLDQ/PSRLDQ. + MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + Lo = DAG.getBitcast(ByteVT, Lo); + Hi = DAG.getBitcast(ByteVT, Hi); + // The actual rotate instruction rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int Scale = 16 / NumLaneElts; // SSSE3 targets can use the palignr instruction. 
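// Editorial sketch (not part of the patch): both branches that follow compute
// the same per-lane value. With R = Rotation * Scale bytes:
//   Result = or(shift_left_bytes(Lo, 16 - R), shift_right_bytes(Hi, R))
// PALIGNR does this in one instruction; the SSE2 fallback spells it out with
// PSLLDQ/PSRLDQ and an OR.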
- if (Subtarget->hasSSSE3()) { - // Cast the inputs to i8 vector of correct length to match PALIGNR. - MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); - Lo = DAG.getBitcast(AlignVT, Lo); - Hi = DAG.getBitcast(AlignVT, Hi); - + if (Subtarget.hasSSSE3()) { + assert((!VT.is512BitVector() || Subtarget.hasBWI()) && + "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, + VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } @@ -7431,21 +7810,19 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); + assert(ByteVT == MVT::v16i8 && + "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; - // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. - Lo = DAG.getBitcast(MVT::v2i64, Lo); - Hi = DAG.getBitcast(MVT::v2i64, Hi); - - SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); - SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, - DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); + DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). @@ -7471,8 +7848,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] -static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); @@ -7510,7 +7888,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); - MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); + MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8) + : MVT::getVectorVT(ShiftSVT, Size / Scale); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); @@ -7526,7 +7905,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. - for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) + unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128); + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) @@ -7539,7 +7919,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. 
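// Editorial note (not part of the patch): SSE4a background for the routine
// below. EXTRQ copies a Len-bit field starting at bit Idx out of the low 64
// bits of its source and zero-fills the rest of that qword; INSERTQ writes
// such a field at a chosen bit offset. Roughly, as a sketch:
//   dst.lo64 = (src.lo64 >> Idx) & (Len == 64 ? ~0ULL : (1ULL << Len) - 1);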
-static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); @@ -7679,8 +8059,8 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( - SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, - ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, + ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); @@ -7713,14 +8093,20 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { // Not worth offseting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); - InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); + InputV = ShuffleOffset(InputV); + + // For 256-bit vectors, we only need the lower (128-bit) input half. + if (VT.is256BitVector()) + InputV = extract128BitVector(InputV, 0, DAG, DL); + + InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); return DAG.getBitcast(VT, InputV); } @@ -7752,33 +8138,33 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. - if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { + if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; - SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(LoIdx, DL, MVT::i8))); + SDValue Lo = DAG.getBitcast( + MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) - return DAG.getNode(ISD::BITCAST, DL, VT, Lo); + return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; - SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(HiIdx, DL, MVT::i8))); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); + SDValue Hi = DAG.getBitcast( + MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. 
We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. - if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { + if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { @@ -7787,10 +8173,9 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); - return DAG.getBitcast(VT, - DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, - DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v16i8, PSHUFBMask))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, + DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that @@ -7837,8 +8222,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( - SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); @@ -7858,7 +8243,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; - if (M == -1) + if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. @@ -7960,8 +8345,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - while (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); + V = peekThroughBitcasts(V); + // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); @@ -7974,7 +8359,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) - return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); + return DAG.getBitcast(EltVT, S); } return SDValue(); @@ -7985,9 +8370,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { - while (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - + V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } @@ -7996,8 +8379,8 @@ static bool isShuffleFoldableLoad(SDValue V) { /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
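// Editorial note (not part of the patch): "element insertion" here means the
// MOVSS/MOVSD-style merge used further down; e.g. for v4f32, as a sketch,
//   MOVSS(V1, V2) == { V2[0], V1[1], V1[2], V1[3] }
// i.e. a single scalar lands in lane 0 and every other lane comes from V1.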
static SDValue lowerVectorShuffleAsElementInsertion( - SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); @@ -8054,7 +8437,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( // This is essentially a special case blend operation, but if we have // general purpose blend operations, they are always faster. Bail and let // the rest of the lowering handle these as blends. - if (Subtarget->hasSSE41()) + if (Subtarget.hasSSE41()) return SDValue(); // Otherwise, use MOVSD or MOVSS. @@ -8082,9 +8465,9 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { - V2 = DAG.getBitcast(MVT::v2i64, V2); + V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( - X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy( DAG.getDataLayout(), VT))); @@ -8094,15 +8477,15 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } -/// \brief Try to lower broadcast of a single - truncated - integer element, +/// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. -static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, - int BroadcastIdx, - const X86Subtarget *Subtarget, +static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, + SDValue V0, int BroadcastIdx, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget->hasAVX2() && + assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); @@ -8153,38 +8536,57 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? -static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, +static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasAVX()) - return SDValue(); - if (VT.isInteger() && !Subtarget->hasAVX2()) + if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || + (Subtarget.hasAVX() && VT.isFloatingPoint()) || + (Subtarget.hasAVX2() && VT.isInteger()))) return SDValue(); + // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise + // we can only broadcast from a register with AVX2. + unsigned NumElts = Mask.size(); + unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; + bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); + // Check that the mask is a broadcast. 
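// Editorial note (not part of the patch): concretely, for NumElts == 4 the
// loop below compares Mask against {0,0,0,0}, {1,1,1,1}, {2,2,2,2} and
// {3,3,3,3}; undef entries match any candidate via isShuffleEquivalent, and
// the first hit becomes BroadcastIdx.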
int BroadcastIdx = -1; - for (int M : Mask) - if (M >= 0 && BroadcastIdx == -1) - BroadcastIdx = M; - else if (M >= 0 && M != BroadcastIdx) - return SDValue(); + for (int i = 0; i != (int)NumElts; ++i) { + SmallVector<int, 8> BroadcastMask(NumElts, i); + if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { + BroadcastIdx = i; + break; + } + } + if (BroadcastIdx < 0) + return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + SDValue V = V1; for (;;) { switch (V.getOpcode()) { + case ISD::BITCAST: { + SDValue VSrc = V.getOperand(0); + MVT SrcVT = VSrc.getSimpleValueType(); + if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits()) + break; + V = VSrc; + continue; + } case ISD::CONCAT_VECTORS: { int OperandSize = Mask.size() / V.getNumOperands(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } - case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); @@ -8219,45 +8621,76 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, MVT BroadcastVT = VT; // Peek through any bitcast (only useful for loads). - SDValue BC = V; - while (BC.getOpcode() == ISD::BITCAST) - BC = BC.getOperand(0); + SDValue BC = peekThroughBitcasts(V); // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); - // If the scalar isn't a load, we can't broadcast from it in AVX1. - // Only AVX2 has register broadcasts. - if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + // If we can't broadcast from a register, check that the input is a load. + if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode); + } // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast<LoadSDNode>(BC); SDValue BaseAddr = Ld->getOperand(1); - EVT AddrVT = BaseAddr.getValueType(); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); - SDValue NewAddr = DAG.getNode( - ISD::ADD, DL, AddrVT, BaseAddr, - DAG.getConstant(Offset, DL, AddrVT)); + SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); - } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { - // We can't broadcast from a vector register without AVX2, and we can only - // broadcast from the zero-element of a vector register. + } else if (!BroadcastFromReg) { + // We can't broadcast from a vector register. 
return SDValue(); + } else if (BroadcastIdx != 0) { + // We can only broadcast from the zero-element of a vector register, + // but it can be advantageous to broadcast from the zero-element of a + // subvector. + if (!VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. + if (VT == MVT::v4f64 || VT == MVT::v4i64) + return SDValue(); + + // Only broadcast the zero-element of a 128-bit subvector. + unsigned EltSize = VT.getScalarSizeInBits(); + if (((BroadcastIdx * EltSize) % 128) != 0) + return SDValue(); + + MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, + DAG.getIntPtrConstant(BroadcastIdx, DL)); } - V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); - return DAG.getBitcast(VT, V); + if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, + DAG.getBitcast(MVT::f64, V)); + + // Bitcast back to the same scalar type as BroadcastVT. + MVT SrcVT = V.getSimpleValueType(); + if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { + assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + "Unexpected vector element size"); + if (SrcVT.isVector()) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); + } else { + SrcVT = BroadcastVT.getScalarType(); + } + V = DAG.getBitcast(SrcVT, V); + } + + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -8266,16 +8699,14 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. -static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG) { - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); +static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, + unsigned &InsertPSMask, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); + assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - unsigned ZMask = 0; int V1DstIndex = -1; int V2DstIndex = -1; @@ -8295,8 +8726,8 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, } // We can only insert a single non-zeroable element. - if (V1DstIndex != -1 || V2DstIndex != -1) - return SDValue(); + if (V1DstIndex >= 0 || V2DstIndex >= 0) + return false; if (Mask[i] < 4) { // V1 input out of place for insertion. @@ -8308,13 +8739,13 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, } // Don't bother if we have no (non-zeroable) element for insertion. - if (V1DstIndex == -1 && V2DstIndex == -1) - return SDValue(); + if (V1DstIndex < 0 && V2DstIndex < 0) + return false; // Determine element insertion src/dst indices. 
The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned V2SrcIndex = 0; - if (V1DstIndex != -1) { + if (V1DstIndex >= 0) { // If we have a V1 input out of place, we use V1 as the V2 element insertion // and don't use the original V2 at all. V2SrcIndex = Mask[V1DstIndex]; @@ -8329,11 +8760,25 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, if (!V1UsedInPlace) V1 = DAG.getUNDEF(MVT::v4f32); - unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + // Insert the V2 element into the desired position. + InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + return true; +} + +static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + // Attempt to match the insertps pattern. + unsigned InsertPSMask; + if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) + return SDValue(); // Insert the V2 element into the desired position. - SDLoc DL(Op); return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } @@ -8347,29 +8792,30 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, +static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); - assert(!isSingleInputShuffleMask(Mask) && + assert(VT.is128BitVector() && + "This routine only works on 128-bit vectors."); + assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); - int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { - return M >= 0 && M % Size < Size / 2; - }); - int NumHiInputs = std::count_if( - Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + int NumLoInputs = + count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); + int NumHiInputs = + count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; - auto TryUnpack = [&](MVT UnpackVT, int Scale) { - SmallVector<int, 32> V1Mask(Mask.size(), -1); - SmallVector<int, 32> V2Mask(Mask.size(), -1); + auto TryUnpack = [&](int ScalarSize, int Scale) { + SmallVector<int, 16> V1Mask((unsigned)Size, -1); + SmallVector<int, 16> V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) @@ -8401,6 +8847,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. 
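// Editorial note (not part of the patch): worked example of the scaling that
// follows. For a v8i16 shuffle tried at ScalarSize == 32 we get Scale == 2,
// so UnpackVT is v4i32 and each unpacked element carries two adjacent i16s.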
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); @@ -8412,15 +8859,10 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, // We try each unpack from the largest to the smallest to try and find one // that fits this mask. - int OrigNumElements = VT.getVectorNumElements(); int OrigScalarSize = VT.getScalarSizeInBits(); - for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { - int Scale = ScalarSize / OrigScalarSize; - int NumElements = OrigNumElements / Scale; - MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); - if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) + if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; - } // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. @@ -8434,8 +8876,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, // half-crossings are created. // FIXME: We could consider commuting the unpacks. - SmallVector<int, 32> PermMask; - PermMask.assign(Size, -1); + SmallVector<int, 32> PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; @@ -8461,28 +8902,25 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. -static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); - if (isSingleInputShuffleMask(Mask)) { - // Use low duplicate instructions for masks that match their pattern. - if (Subtarget->hasSSE3()) - if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) - return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); + if (V2.isUndef()) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); - if (Subtarget->hasAVX()) { + if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. 
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, @@ -8521,7 +8959,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); - if (Subtarget->hasSSE41()) + if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -8542,21 +8980,18 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. -static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); - if (isSingleInputShuffleMask(Mask)) { + if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -8576,28 +9011,29 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); - // If we have a blend of two PACKUS operations an the blend aligns with the - // low and half halves, we can just merge the PACKUS operations. This is - // particularly important as it lets us merge shuffles that this routine itself - // creates. + // If we have a blend of two same-type PACKUS operations and the blend aligns + // with the low and high halves, we can just merge the PACKUS operations. + // This is particularly important as it lets us merge shuffles that this + // routine itself creates. auto GetPackNode = [](SDValue V) { - while (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - + V = peekThroughBitcasts(V); return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); }; if (SDValue V1Pack = GetPackNode(V1)) - if (SDValue V2Pack = GetPackNode(V2)) - return DAG.getBitcast(MVT::v2i64, - DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, - Mask[0] == 0 ? V1Pack.getOperand(0) - : V1Pack.getOperand(1), - Mask[1] == 2 ? V2Pack.getOperand(0) - : V2Pack.getOperand(1))); + if (SDValue V2Pack = GetPackNode(V2)) { + EVT PackVT = V1Pack.getValueType(); + if (PackVT == V2Pack.getValueType()) + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, PackVT, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + } // Try to use shift instructions. 
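// Editorial note (not part of the patch): "shift" lowering here means the
// whole-register byte shifts (PSLLDQ/PSRLDQ) and per-element logical shifts,
// all of which shift in zeros; that is why lowerVectorShuffleAsShift only
// fires when every element shifted in is provably zeroable.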
- if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do @@ -8614,7 +9050,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // We have different paths for blend lowering, but they all must use the // *exact* same predicate. - bool IsBlendSupported = Subtarget->hasSSE41(); + bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) @@ -8627,7 +9063,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget->hasSSSE3()) + if (Subtarget.hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -8655,12 +9091,16 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); + assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. - if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) + if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; - if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) + if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; @@ -8671,14 +9111,13 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. -static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, +static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; - int NumV2Elements = - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = @@ -8689,7 +9128,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, // the low bit. int V2AdjIndex = V2Index ^ 1; - if (Mask[V2AdjIndex] == -1) { + if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. @@ -8761,35 +9200,31 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. 
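// Editorial sketch (not part of the patch): the SHUFPS immediate built by the
// helper above packs four 2-bit selectors,
//   Imm = M0 | (M1 << 2) | (M2 << 4) | (M3 << 6),
// with result elements 0-1 drawn from the first operand and elements 2-3 from
// the second, which is why isSingleSHUFPSMask insists each half of the mask
// uses a single input.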
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - int NumV2Elements = - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. - if (Subtarget->hasSSE3()) { + if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } - if (Subtarget->hasAVX()) { + if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, @@ -8812,13 +9247,13 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return V; - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. - if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) @@ -8827,6 +9262,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return BlendPerm; } + // Use low/high mov instructions. + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) + return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) + return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) @@ -8840,15 +9281,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. 
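// Editorial note (not part of the patch): single-input v4i32 shuffles become
// PSHUFD, which uses the same 2-bit-per-element immediate as SHUFPS above;
// e.g. the mask {2,3,0,1} encodes as 0b01001110 == 0x4E.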
-static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster @@ -8858,13 +9296,12 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return ZExt; - int NumV2Elements = - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -8884,8 +9321,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. @@ -8896,7 +9333,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // We have different paths for blend lowering, but they all must use the // *exact* same predicate. - bool IsBlendSupported = Subtarget->hasSSE41(); + bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) @@ -8913,7 +9350,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget->hasSSSE3()) + if (Subtarget.hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -8957,8 +9394,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. 
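// Editorial note (not part of the patch): background for the routine below.
// Single-input i16 shuffles are composed from PSHUFLW (permutes words 0-3),
// PSHUFHW (permutes words 4-7) and PSHUFD (permutes dwords), so the code
// first migrates inputs into the half that ultimately needs them and then
// performs one half-shuffle on each side.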
static SDValue lowerV8I16GeneralSingleInputVectorShuffle( - SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); @@ -8987,6 +9424,26 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // If we are splatting two values from one half - one to each half, then + // we can shuffle that half so each is splatted to a dword, then splat those + // to their respective halves. + auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, + int DOffset) { + int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; + int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; + V = DAG.getNode(ShufWOp, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); + V = DAG.getBitcast(PSHUFDVT, V); + V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); + return DAG.getBitcast(VT, V); + }; + + if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) + return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); + if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) + return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through @@ -9096,9 +9553,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) - if (M != -1 && M == FixIdx) + if (M >= 0 && M == FixIdx) M = FixFreeIdx; - else if (M != -1 && M == FixFreeIdx) + else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { @@ -9123,9 +9580,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Adjust the mask to match the new locations of A and B. for (int &M : Mask) - if (M != -1 && M/2 == ADWord) + if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; - else if (M != -1 && M/2 == BDWord) + else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't @@ -9194,7 +9651,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { - return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, int Word) { @@ -9213,7 +9670,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. 
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { - if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. @@ -9234,7 +9691,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( } // Map the input's dword into the correct half. - if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == @@ -9280,17 +9737,17 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && - SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && - SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; - } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && - SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. @@ -9304,7 +9761,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) - assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); @@ -9338,8 +9795,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( } // Now hoist the DWord down to the right half. - int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; - assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) @@ -9367,11 +9824,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. - assert(std::count_if(LoMask.begin(), LoMask.end(), - [](int M) { return M >= 4; }) == 0 && + assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); - assert(std::count_if(HiMask.begin(), HiMask.end(), - [](int M) { return M >= 0 && M < 4; }) == 0 && + assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. 
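// Editorial sketch (not part of the patch): the scheme used by
// lowerVectorShuffleAsBlendOfPSHUFBs further down. Each input gets its own
// byte-select mask in which 0x80 yields a zero byte, so the two shuffled
// values can be combined with a plain OR,
//   Result = PSHUFB(V1, V1Mask) | PSHUFB(V2, V2Mask),
// and when only one input is actually referenced the combine step is skipped.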
@@ -9390,11 +9845,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( return V; } -/// \brief Helper to form a PSHUFB-based shuffle+blend. -static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG, bool &V1InUse, - bool &V2InUse) { +/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the +/// blend if only one input is used. +static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V1Mask[16]; SDValue V2Mask[16]; @@ -9404,7 +9859,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, int Size = Mask.size(); int Scale = 16 / Size; for (int i = 0; i < 16; ++i) { - if (Mask[i / Scale] == -1) { + if (Mask[i / Scale] < 0) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; @@ -9425,11 +9880,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V1), - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + DAG.getBuildVector(MVT::v16i8, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V2), - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + DAG.getBuildVector(MVT::v16i8, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; @@ -9454,42 +9909,31 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. -static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> OrigMask = SVOp->getMask(); - int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], - OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; - MutableArrayRef<int> Mask(MaskStorage); - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return ZExt; - auto isV1 = [](int M) { return M >= 0 && M < 8; }; - (void)isV1; - auto isV2 = [](int M) { return M >= 8; }; - - int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, + Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. @@ -9502,21 +9946,24 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return Rotate; - return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, - Subtarget, DAG); + // Make a copy of the mask so it can be modified. + SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end()); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, + MutableMask, Subtarget, + DAG); } - assert(std::any_of(Mask.begin(), Mask.end(), isV1) && + assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. - if (Subtarget->hasSSE4A()) + if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) return V; @@ -9528,7 +9975,7 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // We have different paths for blend lowering, but they all must use the // *exact* same predicate. - bool IsBlendSupported = Subtarget->hasSSE41(); + bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -9552,16 +9999,17 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; + // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. - if (!IsBlendSupported && Subtarget->hasSSSE3()) { + if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, - V1InUse, V2InUse); + return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to @@ -9591,10 +10039,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. -static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { - // Figure out whether we're looping over two inputs or just one. - bool IsSingleInput = isSingleInputShuffleMask(Mask); - +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, + bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. 
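// Editorial note (not part of the patch): with two inputs a mask entry may
// index up to 2 * Size elements, so the "drop even elements" patterns are
// matched modulo 2 * Size; e.g. an N == 1 candidate for a two-input v16i8
// shuffle looks like <0, 2, 4, ...> with each entry allowed to come from
// either input.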
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); @@ -9611,7 +10057,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. - if (Mask[i] == -1) + if (Mask[i] < 0) continue; bool IsAnyViable = false; @@ -9645,20 +10091,17 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. -static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -9672,18 +10115,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return ZExt; // See if we can use SSE4A Extraction / Insertion. - if (Subtarget->hasSSE4A()) + if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) return V; - int NumV2Elements = - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Check whether we can widen this to an i16 shuffle by duplicating bytes. @@ -9696,7 +10138,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef<int> Mask) { for (int i = 0; i < 16; i += 2) - if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) + if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; @@ -9734,7 +10176,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. - while (j < je && PreDupI16Shuffle[j] != -1) + while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) @@ -9759,10 +10201,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) - if (Mask[i] != -1) { + if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 
0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); - if (PostDupI16Shuffle[i / 2] == -1) + if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && @@ -9799,18 +10241,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. - if (Subtarget->hasSSSE3()) { + if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; - SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, - DAG, V1InUse, V2InUse); + SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( + DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { - if (Subtarget->hasSSE41()) + if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -9848,11 +10290,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. - if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + bool IsSingleInput = V2.isUndef(); + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. - bool IsSingleInput = isSingleInputShuffleMask(Mask); // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && @@ -9907,7 +10349,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, - DAG.getConstant(0x00FF, DL, MVT::v8i16)); + DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); @@ -9938,22 +10380,23 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. 
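As a standalone illustration (not part of the patch) of the even-element-dropping test the v16i8 lowering above relies on before forming its PACKUS sequence: element i of a vector whose even elements have been dropped N times comes from index i * 2^N of the (possibly concatenated) input, so a mask qualifies iff every defined entry matches that pattern modulo the input width. The sketch below re-implements the idea over plain std::vector; the sketch* name is invented for this note.

#include <vector>

// Return the smallest N in {1,2,3} for which the mask can be produced by
// dropping even elements N times, or 0 if no such N exists.
static int sketchCanDropEvenElements(const std::vector<int> &Mask,
                                     bool IsSingleInput) {
  int Modulus = (int)Mask.size() * (IsSingleInput ? 1 : 2);
  for (int N = 1; N <= 3; ++N) {            // strides of 2, 4 and 8
    bool Viable = true;
    for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
      if (Mask[i] < 0)
        continue;                           // undef lanes match any pattern
      if (Mask[i] % Modulus != (i << N) % Modulus) {
        Viable = false;
        break;
      }
    }
    if (Viable)
      return N;
  }
  return 0;
}

// e.g. a v16i8 mask selecting bytes {0, 2, 4, ..., 30} from the concatenation
// of two inputs is viable with N == 1 and maps onto a single PACKUSWB (once
// the high byte of each word has been masked to zero, as the code above does).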
-static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, - MVT VT, const X86Subtarget *Subtarget, +static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: - return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v2f64: - return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v4i32: - return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v4f32: - return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v8i16: - return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v16i8: - return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); @@ -9971,21 +10414,22 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef<int> Mask, SmallVectorImpl<int> &WidenedMask) { + WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { // If both elements are undef, its trivial. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { - WidenedMask.push_back(SM_SentinelUndef); + WidenedMask[i/2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { - WidenedMask.push_back(Mask[i + 1] / 2); + WidenedMask[i/2] = Mask[i + 1] / 2; continue; } if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { - WidenedMask.push_back(Mask[i] / 2); + WidenedMask[i/2] = Mask[i] / 2; continue; } @@ -9993,7 +10437,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { - WidenedMask.push_back(SM_SentinelZero); + WidenedMask[i/2] = SM_SentinelZero; continue; } return false; @@ -10002,7 +10446,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, // Finally check if the two mask values are adjacent and aligned with // a pair. if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { - WidenedMask.push_back(Mask[i] / 2); + WidenedMask[i/2] = Mask[i] / 2; continue; } @@ -10020,7 +10464,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. 
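A standalone sketch (not part of the patch) of the pair-widening check performed by canWidenShuffleElements above, written against plain integer vectors with the same sentinel convention (-1 undef, -2 zero); the sketch* name and the kUndef/kZero constants are invented for this note.

#include <vector>

static const int kUndef = -1;  // mirrors SM_SentinelUndef
static const int kZero  = -2;  // mirrors SM_SentinelZero

// Returns true if the mask can be expressed with elements twice as wide,
// filling Widened with the half-length mask.
static bool sketchWidenMask(const std::vector<int> &Mask,
                            std::vector<int> &Widened) {
  Widened.assign(Mask.size() / 2, kUndef);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == kUndef && Hi == kUndef)
      continue;                              // wide element stays undef
    // A zeroed pair (allowing undef halves) becomes a zeroed wide element.
    if ((Lo == kZero || Lo == kUndef) && (Hi == kZero || Hi == kUndef) &&
        (Lo == kZero || Hi == kZero)) {
      Widened[i / 2] = kZero;
      continue;
    }
    // A pair (2k, 2k+1), possibly with one half undef, maps to wide index k.
    if (Lo == kUndef && Hi >= 0 && Hi % 2 == 1) { Widened[i / 2] = Hi / 2; continue; }
    if (Hi == kUndef && Lo >= 0 && Lo % 2 == 0) { Widened[i / 2] = Lo / 2; continue; }
    if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1) { Widened[i / 2] = Lo / 2; continue; }
    return false;                            // this pair cannot be widened
  }
  return true;
}

// e.g. the v4i32 mask {2, 3, 6, 7} widens to the v2i64 mask {1, 3}, letting
// the 128-bit dispatcher above treat it as a 64-bit element shuffle.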
-static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, +static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && @@ -10039,8 +10483,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { - while (V.getOpcode() == ISD::BITCAST) - V = V->getOperand(0); + V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); @@ -10063,8 +10506,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } - LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); - HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); + LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); + HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); @@ -10077,7 +10520,9 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef<int> HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; - SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1); + SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1); + SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { @@ -10085,21 +10530,15 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, UseHiV2 = true; else UseLoV2 = true; - V2BlendMask.push_back(M - NumElements); - V1BlendMask.push_back(-1); - BlendMask.push_back(SplitNumElements + i); + V2BlendMask[i] = M - NumElements; + BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; - V2BlendMask.push_back(-1); - V1BlendMask.push_back(M); - BlendMask.push_back(i); - } else { - V2BlendMask.push_back(-1); - V1BlendMask.push_back(-1); - BlendMask.push_back(-1); + V1BlendMask[i] = M; + BlendMask[i] = i; } } @@ -10151,12 +10590,12 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. 
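A standalone sketch (not part of the patch) of the mask bookkeeping behind the split-and-lower path above: for each half of the wide mask it derives two single-input half-width masks plus a blend mask that stitches their results together. The struct and sketch* names are invented for this note.

#include <vector>

struct HalfBlendMasks {
  std::vector<int> V1Mask, V2Mask, Blend;
};

// HalfMask holds NumElements / 2 entries taken from either half of the
// original wide mask; NumElements is the full wide element count.
static HalfBlendMasks sketchHalfBlend(const std::vector<int> &HalfMask,
                                      int NumElements) {
  int SplitNumElements = NumElements / 2;
  HalfBlendMasks R;
  R.V1Mask.assign(SplitNumElements, -1);
  R.V2Mask.assign(SplitNumElements, -1);
  R.Blend.assign(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {                  // element comes from V2
      R.V2Mask[i] = M - NumElements;
      R.Blend[i] = SplitNumElements + i;     // take lane i of shuffled V2
    } else if (M >= 0) {                     // element comes from V1
      R.V1Mask[i] = M;
      R.Blend[i] = i;                        // take lane i of shuffled V1
    }                                        // undef entries stay -1 throughout
  }
  return R;
}

// The V1/V2 half masks still index the whole of V1 or V2, which is why the
// routine above additionally tracks UseLoV1/UseHiV1/UseLoV2/UseHiV2 to decide
// which extracted 128-bit halves actually feed the two half shuffles.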
-static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, +static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, SelectionDAG &DAG) { - assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " - "lower single-input shuffles as it " - "could then recurse on itself."); + assert(!V2.isUndef() && "This routine must not be used to lower single-input " + "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, @@ -10166,12 +10605,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { - if (V2BroadcastIdx == -1) + if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { - if (V1BroadcastIdx == -1) + if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; @@ -10210,54 +10649,51 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. -static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); - int LaneSize = Mask.size() / 2; + int Size = Mask.size(); + int LaneSize = Size / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); - if (isSingleInputShuffleMask(Mask)) { - SmallVector<int, 32> FlippedBlendMask; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - FlippedBlendMask.push_back( - Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) - ? Mask[i] - : Mask[i] % LaneSize + - (i / LaneSize) * LaneSize + Size)); - - // Flip the vector, and blend the results which should now be in-lane. The - // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and - // 5 for the high source. The value 3 selects the high half of source 2 and - // the value 2 selects the low half of source 2. We only use source 2 to - // allow folding it into a memory operand. - unsigned PERMMask = 3 | 2 << 4; - SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), - V1, DAG.getConstant(PERMMask, DL, MVT::i8)); - return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); - } - - // This now reduces to two single-input shuffles of V1 and V2 which at worst - // will be handled by the above logic and a blend of the results, much like - // other patterns in AVX. 
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); + assert(V2.isUndef() && + "This last part of this routine only works on single input shuffles"); + + SmallVector<int, 32> FlippedBlendMask(Size); + for (int i = 0; i < Size; ++i) + FlippedBlendMask[i] = + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, DL, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } /// \brief Handle lowering 2-lane 128-bit shuffles. -static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, +static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the @@ -10278,6 +10714,10 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding. + if (Subtarget.hasAVX2() && V2.isUndef()) + return SDValue(); + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, @@ -10349,10 +10789,9 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, /// in x86 only floating point has interesting non-repeating shuffles, and even /// those are still *marginally* more expensive. static SDValue lowerVectorShuffleByMerging128BitLanes( - SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(!isSingleInputShuffleMask(Mask) && - "This is only useful with multiple inputs."); + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { + assert(!V2.isUndef() && "This is only useful with multiple inputs."); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); @@ -10361,10 +10800,8 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also // check whether the in-128-bit lane shuffles share a repeating pattern. - SmallVector<int, 4> Lanes; - Lanes.resize(NumLanes, -1); - SmallVector<int, 4> InLaneMask; - InLaneMask.resize(LaneSize, -1); + SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); + SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; @@ -10392,8 +10829,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( // First shuffle the lanes into place. MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? 
MVT::f64 : MVT::i64, VT.getSizeInBits() / 64); - SmallVector<int, 8> LaneMask; - LaneMask.resize(NumLanes * 2, -1); + SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1); for (int i = 0; i < NumLanes; ++i) if (Lanes[i] >= 0) { LaneMask[2 * i + 0] = 2*Lanes[i] + 0; @@ -10408,8 +10844,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( LaneShuffle = DAG.getBitcast(VT, LaneShuffle); // Now do a simple shuffle that isn't lane crossing. - SmallVector<int, 8> NewMask; - NewMask.resize(Size, -1); + SmallVector<int, 8> NewMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; @@ -10422,11 +10857,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( /// Lower shuffles where an entire half of a 256-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. -static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const X86Subtarget *Subtarget, +static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + assert(VT.is256BitVector() && "Expected 256-bit vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; @@ -10457,21 +10893,16 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, DAG.getIntPtrConstant(HalfNumElts, DL)); } - // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. - if (UndefLower && Subtarget->hasAVX2() && - (VT == MVT::v4f64 || VT == MVT::v4i64)) - return SDValue(); - - // If the shuffle only uses the lower halves of the input operands, + // If the shuffle only uses two of the four halves of the input operands, // then extract them and perform the 'half' shuffle at half width. // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> int HalfIdx1 = -1, HalfIdx2 = -1; - SmallVector<int, 8> HalfMask; + SmallVector<int, 8> HalfMask(HalfNumElts); unsigned Offset = UndefLower ? HalfNumElts : 0; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + Offset]; if (M < 0) { - HalfMask.push_back(M); + HalfMask[i] = M; continue; } @@ -10479,23 +10910,18 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; - // Only shuffle using the lower halves of the inputs. - // TODO: Investigate usefulness of shuffling with upper halves. - if (HalfIdx != 0 && HalfIdx != 2) - return SDValue(); - // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. 
- if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { - HalfMask.push_back(HalfElt); + if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { + HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } - if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { - HalfMask.push_back(HalfElt + HalfNumElts); + if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { + HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } @@ -10505,6 +10931,33 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, } assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + // Only shuffle the halves of the inputs when useful. + int NumLowerHalves = + (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); + int NumUpperHalves = + (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); + + // uuuuXXXX - don't extract uppers just to insert again. + if (UndefLower && NumUpperHalves != 0) + return SDValue(); + + // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. + if (UndefUpper && NumUpperHalves == 2) + return SDValue(); + + // AVX2 - XXXXuuuu - always extract lowers. + if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (VT == MVT::v4f64 || VT == MVT::v4i64) + return SDValue(); + // AVX2 supports variable 32-bit element cross-lane shuffles. + if (VT == MVT::v8f32 || VT == MVT::v8i32) { + // XXXXuuuu - don't extract lowers and uppers. + if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) + return SDValue(); + } + } + auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); @@ -10536,7 +10989,177 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { return true; } -static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, +/// Handle case where shuffle sources are coming from the same 128-bit lane and +/// every lane can be represented as the same repeating mask - allowing us to +/// shuffle the sources with the repeating shuffle and then permute the result +/// to the destination lanes. +static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + + // On AVX2 we may be able to just shuffle the lowest elements and then + // broadcast the result. + if (Subtarget.hasAVX2()) { + for (unsigned BroadcastSize : {16, 32, 64}) { + if (BroadcastSize <= VT.getScalarSizeInBits()) + continue; + int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); + + // Attempt to match a repeating pattern every NumBroadcastElts, + // accounting for UNDEFs but only references the lowest 128-bit + // lane of the inputs. + auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { + for (int i = 0; i != NumElts; i += NumBroadcastElts) + for (int j = 0; j != NumBroadcastElts; ++j) { + int M = Mask[i + j]; + if (M < 0) + continue; + int &R = RepeatMask[j]; + if (0 != ((M % NumElts) / NumLaneElts)) + return false; + if (0 <= R && R != M) + return false; + R = M; + } + return true; + }; + + SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); + if (!FindRepeatingBroadcastMask(RepeatMask)) + continue; + + // Shuffle the (lowest) repeated elements in place for broadcast. 
+ SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); + + // Shuffle the actual broadcast. + SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumBroadcastElts) + for (int j = 0; j != NumBroadcastElts; ++j) + BroadcastMask[i + j] = j; + return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), + BroadcastMask); + } + } + + // Bail if the shuffle mask doesn't cross 128-bit lanes. + if (!is128BitLaneCrossingShuffleMask(VT, Mask)) + return SDValue(); + + // Bail if we already have a repeated lane shuffle mask. + SmallVector<int, 8> RepeatedShuffleMask; + if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) + return SDValue(); + + // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes + // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. + int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; + int NumSubLanes = NumLanes * SubLaneScale; + int NumSubLaneElts = NumLaneElts / SubLaneScale; + + // Check that all the sources are coming from the same lane and see if we can + // form a repeating shuffle mask (local to each sub-lane). At the same time, + // determine the source sub-lane for each destination sub-lane. + int TopSrcSubLane = -1; + SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); + SmallVector<int, 8> RepeatedSubLaneMasks[2] = { + SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef), + SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)}; + + for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { + // Extract the sub-lane mask, check that it all comes from the same lane + // and normalize the mask entries to come from the first lane. + int SrcLane = -1; + SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1); + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; + if (M < 0) + continue; + int Lane = (M % NumElts) / NumLaneElts; + if ((0 <= SrcLane) && (SrcLane != Lane)) + return SDValue(); + SrcLane = Lane; + int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); + SubLaneMask[Elt] = LocalM; + } + + // Whole sub-lane is UNDEF. + if (SrcLane < 0) + continue; + + // Attempt to match against the candidate repeated sub-lane masks. + for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { + auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) { + for (int i = 0; i != NumSubLaneElts; ++i) { + if (M1[i] < 0 || M2[i] < 0) + continue; + if (M1[i] != M2[i]) + return false; + } + return true; + }; + + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; + if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) + continue; + + // Merge the sub-lane mask into the matching repeated sub-lane mask. + for (int i = 0; i != NumSubLaneElts; ++i) { + int M = SubLaneMask[i]; + if (M < 0) + continue; + assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && + "Unexpected mask element"); + RepeatedSubLaneMask[i] = M; + } + + // Track the top most source sub-lane - by setting the remaining to UNDEF + // we can greatly simplify shuffle matching. + int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; + TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); + Dst2SrcSubLanes[DstSubLane] = SrcSubLane; + break; + } + + // Bail if we failed to find a matching repeated sub-lane mask. 
+ if (Dst2SrcSubLanes[DstSubLane] < 0) + return SDValue(); + } + assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && + "Unexpected source lane"); + + // Create a repeating shuffle mask for the entire vector. + SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); + for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { + int Lane = SubLane / SubLaneScale; + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = RepeatedSubLaneMask[Elt]; + if (M < 0) + continue; + int Idx = (SubLane * NumSubLaneElts) + Elt; + RepeatedMask[Idx] = M + (Lane * NumLaneElts); + } + } + SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); + + // Shuffle each source sub-lane to its destination. + SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumSubLaneElts) { + int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; + if (SrcSubLane < 0) + continue; + for (int j = 0; j != NumSubLaneElts; ++j) + SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); + } + + return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), + SubLaneMask); +} + +static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -10571,25 +11194,24 @@ static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallVector<int, 4> WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) - return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, - DAG); + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return V; - if (isSingleInputShuffleMask(Mask)) { + if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. @@ -10597,7 +11219,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { - // Non-half-crossing single input shuffles can be lowerid with an + // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); @@ -10606,10 +11228,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // With AVX2 we have direct support for this permutation. 
- if (Subtarget->hasAVX2()) + if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return V; + // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); @@ -10629,19 +11257,25 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return V; + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. - if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || - isShuffleMaskInputInPlace(1, Mask)))) + if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. - if (Subtarget->hasAVX2()) + if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); @@ -10653,59 +11287,53 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. -static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); SmallVector<int, 4> WidenedMask; if (canWidenShuffleElements(Mask, WidenedMask)) - return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, - DAG); + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; - // When the shuffle is mirrored between the 128-bit lanes of the unit, we can - // use lower latency instructions that will operate on both 128-bit lanes. 
- SmallVector<int, 2> RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { - if (isSingleInputShuffleMask(Mask)) { - int PSHUFDMask[] = {-1, -1, -1, -1}; - for (int i = 0; i < 2; ++i) - if (RepeatedMask[i] >= 0) { - PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; - PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; - } + if (V2.isUndef()) { + // When the shuffle is mirrored between the 128-bit lanes of the unit, we + // can use lower latency instructions that will operate on both lanes. + SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + SmallVector<int, 4> PSHUFDMask; + scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } - } - // AVX2 provides a direct instruction for permuting a single input across - // lanes. - if (isSingleInputShuffleMask(Mask)) + // AVX2 provides a direct instruction for permuting a single input across + // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. @@ -10717,7 +11345,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. - if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) @@ -10732,14 +11360,12 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10747,7 +11373,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; @@ -10759,12 +11385,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. 
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); - if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) + if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); - if (isSingleInputShuffleMask(Mask)) + if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); @@ -10774,30 +11400,30 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we - // have already handled any direct blends. We also need to squash the - // repeated mask into a simulated v4f32 mask. - for (int i = 0; i < 4; ++i) - if (RepeatedMask[i] >= 8) - RepeatedMask[i] -= 4; + // have already handled any direct blends. return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. - if (isSingleInputShuffleMask(Mask)) { + if (V2.isUndef()) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], DL, MVT::i32); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) - return DAG.getNode( - X86ISD::VPERMILPV, DL, MVT::v8f32, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getBuildVector(MVT::v8i32, DL, VPermMask)); - if (Subtarget->hasAVX2()) - return DAG.getNode( - X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + if (Subtarget.hasAVX2()) + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10812,7 +11438,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. - if (Subtarget->hasAVX2()) + if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); @@ -10824,16 +11450,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. 
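A standalone sketch (not part of the patch) of the per-128-bit-lane repetition test that the v8f32 path above (and the integer paths that follow) depends on, over plain vectors; the sketch* name is invented here. A mask repeats if every lane uses the same in-lane pattern, with each element drawn from the matching lane of either source.

#include <vector>

// LaneSize is 128 divided by the element width in bits (4 for f32/i32).
static bool sketchIs128BitLaneRepeated(const std::vector<int> &Mask,
                                       int LaneSize,
                                       std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                              // undef never disqualifies
    if ((M % Size) / LaneSize != i / LaneSize)
      return false;                          // crosses into another lane
    // Normalize to a first-lane index while keeping V1 vs V2 apart.
    int Local = (M % LaneSize) + (M < Size ? 0 : Size);
    int &R = Repeated[i % LaneSize];
    if (R < 0)
      R = Local;
    else if (R != Local)
      return false;                          // lanes disagree on the pattern
  }
  return true;
}

// e.g. the v8f32 mask {0, 0, 2, 2, 4, 4, 6, 6} repeats as {0, 0, 2, 2}, which
// is exactly the MOVSLDUP pattern matched above.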
-static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the @@ -10847,7 +11471,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; @@ -10857,7 +11481,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); - if (isSingleInputShuffleMask(Mask)) + if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); @@ -10868,24 +11492,30 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Shift; + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. - if (isSingleInputShuffleMask(Mask)) { + if (V2.isUndef()) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(Mask[i], DL, MVT::i32); - return DAG.getNode( - X86ISD::VPERMV, DL, MVT::v8i32, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -10903,16 +11533,14 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. 
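A standalone sketch (not part of the patch) of how a 4-element in-lane mask, such as the repeated mask the v8i32 path above feeds to PSHUFD, is packed into the 8-bit immediate used by PSHUFD/VPERMILPS/SHUFPS-style instructions: two bits per destination element, lowest element in the lowest bits. The sketch* name is invented for this note.

#include <cassert>
#include <cstdint>

static uint8_t sketchV4ShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];       // an undef lane may use any value
    assert(M < 4 && "expected an in-lane, already-normalized index");
    Imm |= (uint8_t)(M << (2 * i));
  }
  return Imm;
}

// e.g. the repeated v4i32 mask {3, 2, 1, 0} packs to 0x1B (0b00011011), the
// familiar "reverse the four elements" PSHUFD immediate.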
-static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the @@ -10922,7 +11550,7 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; @@ -10936,8 +11564,8 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -10945,7 +11573,13 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; - if (isSingleInputShuffleMask(Mask)) { + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return V; + + if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) @@ -10960,26 +11594,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } - - SDValue PSHUFBMask[32]; - for (int i = 0; i < 16; ++i) { - if (Mask[i] == -1) { - PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); - continue; - } - - int M = i < 8 ? Mask[i] : Mask[i] - 8; - assert(M >= 0 && M < 8 && "Invalid single-input mask!"); - PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); - PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); - } - return DAG.getBitcast(MVT::v16i16, - DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, - DAG.getBitcast(MVT::v32i8, V1), - DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v32i8, PSHUFBMask))); } + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, + V2, Subtarget, DAG)) + return PSHUFB; + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( @@ -10994,16 +11614,14 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. 
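A standalone sketch (not part of the patch) of the word-to-byte mask expansion behind the PSHUFB-based v16i16 lowering above (previously inlined, now done through the shared lowerVectorShuffleWithPSHUFB helper). It assumes a single-input mask that does not cross 128-bit lanes (the crossing case falls back to the lane permute and blend path), because VPSHUFB on a 256-bit vector only shuffles within each 128-bit lane. Each word index M becomes the byte indices 2*M and 2*M+1; a byte with its top bit set (0x80) would make PSHUFB write zero, which is how the PSHUFB helpers above handle zeroable lanes, while plain undef words are simply left as don't-care (-1) here. The sketch* name is invented for this note.

#include <vector>

static std::vector<int> sketchWordMaskToPshufbBytes(const std::vector<int> &Mask) {
  std::vector<int> Bytes(2 * Mask.size(), -1);
  const int WordsPerLane = 8;                // 128 bits / 16-bit elements
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue;                              // undef word -> don't-care bytes
    int M = Mask[i] % WordsPerLane;          // reduce to an in-lane word index
    Bytes[2 * i + 0] = 2 * M + 0;
    Bytes[2 * i + 1] = 2 * M + 1;
  }
  return Bytes;
}

// e.g. the in-lane v16i16 mask {1,0,3,2,5,4,7,6, 9,8,11,10,13,12,15,14}
// expands to the byte indices {2,3,0,1,6,7,4,5,...} repeated in both lanes.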
-static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); - assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the @@ -11013,7 +11631,7 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; @@ -11027,8 +11645,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; // Try to use shift instructions. - if (SDValue Shift = - lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -11036,25 +11654,21 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; - if (isSingleInputShuffleMask(Mask)) { - // There are no generalized cross-lane shuffle operations available on i8 - // element types. - if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, - Mask, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return V; - SDValue PSHUFBMask[32]; - for (int i = 0; i < 32; ++i) - PSHUFBMask[i] = - Mask[i] < 0 - ? DAG.getUNDEF(MVT::i8) - : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL, - MVT::i8); + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, + DAG); - return DAG.getNode( - X86ISD::PSHUFB, DL, MVT::v32i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); - } + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, + V2, Subtarget, DAG)) + return PSHUFB; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. @@ -11071,19 +11685,14 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. 
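A standalone sketch (not part of the patch) of the VPERM2X128 immediate that the lane permute and blend fallback used above builds (the "3 | 2 << 4" value): each destination 128-bit lane has a 4-bit selector whose low two bits pick a source half (0/1 = low/high half of source 1, 2/3 = low/high half of source 2) and whose bit 3 zeroes the lane; the low nibble drives the low destination lane and the high nibble the high one. The sketch* name and its parameters are invented for this note.

#include <cstdint>

static uint8_t sketchVperm2x128Imm(int LoSel, int HiSel,
                                   bool ZeroLo = false, bool ZeroHi = false) {
  return (uint8_t)((LoSel & 3) | (ZeroLo ? 0x08 : 0) |
                   ((HiSel & 3) << 4) | (ZeroHi ? 0x80 : 0));
}

// The flip used above is sketchVperm2x128Imm(/*LoSel=*/3, /*HiSel=*/2) == 0x23:
// both destination lanes read from source 2 (so that operand can be folded
// from memory), with its high and low 128-bit halves swapped.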
-static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, - MVT VT, const X86Subtarget *Subtarget, +static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); - int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { - return M >= NumElts; - }); + int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( @@ -11101,11 +11710,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. - if (VT.isInteger() && !Subtarget->hasAVX2()) { + if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); - if (ElementBits < 32) - // No floating point type available, decompose into 128-bit vectors. + if (ElementBits < 32) { + // No floating point type available, if we can't use the bit operations + // for masking/blending then decompose into 128-bit vectors. + if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return V; + if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); @@ -11116,17 +11731,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, switch (VT.SimpleTy) { case MVT::v4f64: - return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v4i64: - return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v8f32: - return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v8i32: - return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v16i16: - return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v32i8: - return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); @@ -11134,21 +11749,37 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } /// \brief Try to lower a vector shuffle as a 128-bit shuffles. 
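A standalone sketch (not part of the patch) of the control byte the 512-bit, 128-bit-chunk lowering below computes for VSHUFF64X2/VSHUFI64X2: once the 8-element mask has been widened to four chunk selectors, each destination chunk contributes two bits, and because destination chunks 0 and 1 always read the first operand while chunks 2 and 3 read the second, only the selector modulo 4 is encoded. The sketch* name is invented for this note.

#include <cstdint>
#include <vector>

// WidenedMask holds 4 entries in [0, 8), or -1 for undef (undef reuses 0).
static uint8_t sketchShuf128Imm(const std::vector<int> &WidenedMask) {
  uint8_t Perm = 0;
  unsigned BitsPerChunk = (unsigned)WidenedMask.size() / 2;  // 2 for v8i64
  for (size_t i = 0; i != WidenedMask.size(); ++i) {
    unsigned Imm = WidenedMask[i] < 0 ? 0 : (unsigned)WidenedMask[i];
    Perm |= (uint8_t)((Imm % WidenedMask.size()) << (i * BitsPerChunk));
  }
  return Perm;
}

// e.g. the widened mask {0, 1, 4, 5} (low 256 bits of the first operand next
// to the low 256 bits of the second) encodes as 0x44 (0b01000100).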
-static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, - ArrayRef<int> Mask, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. - assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); SmallVector<int, 4> WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); + SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; + // Insure elements came from the same Op. + int MaxOp1Index = VT.getVectorNumElements()/2 - 1; + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + if (WidenedMask[i] == SM_SentinelUndef) + continue; + + SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1; + unsigned OpIndex = (i < Size/2) ? 0 : 1; + if (Ops[OpIndex].isUndef()) + Ops[OpIndex] = Op; + else if (Ops[OpIndex] != Op) + return SDValue(); + } + // Form a 128-bit permutation. // Convert the 64-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. @@ -11156,19 +11787,16 @@ static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, unsigned ControlBitsNum = WidenedMask.size() / 2; for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { - if (WidenedMask[i] == SM_SentinelZero) - return SDValue(); - // Use first element in place of undef mask. Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); } - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i8)); } -static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, +static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -11178,23 +11806,43 @@ static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); - if (isSingleInputShuffleMask(Mask)) + if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. -static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + if (V2.isUndef()) { + // Use low duplicate instructions for masks that match their pattern. 
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); + + if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { + // Non-half-crossing single input shuffles can be lowered with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | + ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | + ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, + DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + } + + SmallVector<int, 4> RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + } + if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Shuf128; @@ -11203,42 +11851,90 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; + // Check if the blend happens to exactly fit that of SHUFPD. + if (SDValue Op = + lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Op; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. -static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) - return Unpck; + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + + // Use even/odd duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); + if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); + + if (V2.isUndef()) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; + + // Otherwise, fall back to a SHUFPS sequence. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); + } return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. 
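The VPERMILPMask computation in the v8f64 single-input path above packs one bit per element: bit i is set when result element i takes the odd double of its 128-bit lane. A minimal standalone illustration of the same encoding (not LLVM code):

#include <array>
#include <cstdio>

int main() {
  // In-lane v8f64 mask that swaps the two doubles of every 128-bit lane.
  std::array<int, 8> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    Imm |= unsigned(Mask[i] == (i | 1)) << i;   // (i | 1) is the odd index of i's lane.
  std::printf("vpermilpd imm = 0x%02X\n", Imm); // prints 0x55
}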
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Shuf128; + if (V2.isUndef()) { + // When the shuffle is mirrored between the 128-bit lanes of the unit, we + // can use lower latency instructions that will operate on all four + // 128-bit lanes. + SmallVector<int, 2> Repeated128Mask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { + SmallVector<int, 4> PSHUFDMask; + scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); + return DAG.getBitcast( + MVT::v8i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, + DAG.getBitcast(MVT::v16i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + } + + SmallVector<int, 4> Repeated256Mask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, + getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); + } + + // Try to use shift instructions. + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) + return Shift; + if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; @@ -11247,49 +11943,111 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. -static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) - return Unpck; + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the four 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (V2.isUndef()) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return V; + } + + // Try to use shift instructions. + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) + return Shift; + + // Try to use byte rotation instructions. 
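The v8i64 single-input path above leans on two mask helpers, is128BitLaneRepeatedShuffleMask and scaleShuffleMask. The sketch below is our own simplified single-input reimplementation, not the LLVM helpers; it shows how a per-lane-repeated v2i64 pattern becomes the v4i32 mask handed to PSHUFD on the bitcast v16i32 value.

#include <cstdio>
#include <vector>

// True if every 128-bit lane of Mask applies the same in-lane pattern; the
// pattern (indices relative to one lane) is returned in Repeated.
static bool isLaneRepeatedMask(int LaneElts, const std::vector<int> &Mask,
                               std::vector<int> &Repeated) {
  Repeated.assign(LaneElts, -1);
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                  // Undef matches anything.
    if (M / LaneElts != i / LaneElts)
      return false;                              // Element crosses a lane.
    int &R = Repeated[i % LaneElts];
    if (R < 0)
      R = M % LaneElts;
    else if (R != M % LaneElts)
      return false;                              // Lanes disagree.
  }
  return true;
}

int main() {
  // v8i64 mask that swaps the two quadwords inside every 128-bit lane.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6}, Repeated;
  if (!isLaneRepeatedMask(2, Mask, Repeated))
    return 1;
  std::vector<int> PSHUFDMask;                   // Scale x2 for the v16i32 view.
  for (int M : Repeated)
    for (int j = 0; j < 2; ++j)
      PSHUFDMask.push_back(M * 2 + j);
  for (int M : PSHUFDMask)
    std::printf("%d ", M);                       // prints: 2 3 0 1
  std::printf("\n");
}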
+ if (Subtarget.hasBWI()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. -static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); - assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, + Subtarget, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (V2.isUndef()) { + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v32 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); + } + } return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. -static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, +static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); - assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, + Subtarget, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, + V2, Subtarget, DAG)) + return PSHUFB; // FIXME: Implement direct support for this type! 
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); @@ -11300,61 +12058,50 @@ static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, - MVT VT, const X86Subtarget *Subtarget, +static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - assert(Subtarget->hasAVX512() && + assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // Check for being able to broadcast a single element. if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) + lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; - // Dispatch to each element type for lowering. If we don't have supprot for + // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v8i64: - return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v32i16: - if (Subtarget->hasBWI()) - return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); - break; + return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); case MVT::v64i8: - if (Subtarget->hasBWI()) - return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); - break; + return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } - - // Otherwise fall back on splitting. - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. 
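As the comment above notes, AVX-512 has no instruction that shuffles mask bits directly, so vXi1 shuffles go through a wide integer type. A standalone scalar sketch of that extend/shuffle/truncate round trip (not the DAG lowering itself):

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t MaskBits = 0xB2;                       // v8i1 input: 1011'0010
  std::array<int, 8> Shuffle = {7, 6, 5, 4, 3, 2, 1, 0}; // Reverse the bits.

  std::array<int32_t, 8> Lanes, Shuffled;
  for (int i = 0; i < 8; ++i)                    // Sign-extend bit i to a full lane.
    Lanes[i] = (MaskBits >> i) & 1 ? -1 : 0;
  for (int i = 0; i < 8; ++i)                    // Shuffle whole lanes.
    Shuffled[i] = Lanes[Shuffle[i]];
  uint8_t Result = 0;
  for (int i = 0; i < 8; ++i)                    // Truncate lanes back to bits.
    Result |= (Shuffled[i] & 1) << i;
  std::printf("0x%02X -> 0x%02X\n", (unsigned)MaskBits, (unsigned)Result); // 0xB2 -> 0x4D
}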
-static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, - MVT VT, const X86Subtarget *Subtarget, +static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - assert(Subtarget->hasAVX512() && + assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); MVT ExtVT; switch (VT.SimpleTy) { @@ -11405,7 +12152,7 @@ static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. -static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, +static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> Mask = SVOp->getMask(); @@ -11413,14 +12160,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); - SDLoc dl(Op); + SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsUndef = V1.isUndef(); + bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); @@ -11440,7 +12187,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, for (int &M : NewMask) if (M >= NumElements) M = -1; - return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // We actually see shuffles that are entirely re-arrangements of a set of @@ -11448,7 +12195,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // simple ones. Directly lower these as a buildvector of zeros. SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.all()) - return getZeroVector(VT, Subtarget, DAG, dl); + return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point @@ -11467,12 +12214,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( - VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; - for (int M : SVOp->getMask()) + for (int M : Mask) if (M < 0) ++NumUndefElements; else if (M < NumElements) @@ -11486,6 +12233,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, if (NumV2Elements > NumV1Elements) return DAG.getCommutedVectorShuffle(*SVOp); + assert(NumV1Elements > 0 && "No V1 indices"); + assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used"); + // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. 
When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum @@ -11493,28 +12243,28 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; - for (int M : SVOp->getMask().slice(0, NumElements / 2)) + for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; - if (LowV2Elements > LowV1Elements) { + if (LowV2Elements > LowV1Elements) return DAG.getCommutedVectorShuffle(*SVOp); - } else if (LowV2Elements == LowV1Elements) { + if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; - for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) - if (SVOp->getMask()[i] >= NumElements) + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= NumElements) SumV2Indices += i; - else if (SVOp->getMask()[i] >= 0) + else if (Mask[i] >= 0) SumV1Indices += i; - if (SumV2Indices < SumV1Indices) { + if (SumV2Indices < SumV1Indices) return DAG.getCommutedVectorShuffle(*SVOp); - } else if (SumV2Indices == SumV1Indices) { + if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; - for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) - if (SVOp->getMask()[i] >= NumElements) + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; - else if (SVOp->getMask()[i] >= 0) + else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return DAG.getCommutedVectorShuffle(*SVOp); @@ -11524,69 +12274,23 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) - return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); if (VT.is256BitVector()) - return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); if (VT.is512BitVector()) - return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); if (Is1BitVector) - return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - llvm_unreachable("Unimplemented!"); -} + return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); -// This function assumes its argument is a BUILD_VECTOR of constants or -// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is -// true. -static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, - unsigned &MaskValue) { - MaskValue = 0; - unsigned NumElems = BuildVector->getNumOperands(); - - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - // We don't handle the >2 lanes case right now. - unsigned NumLanes = (NumElems - 1) / 8 + 1; - if (NumLanes > 2) - return false; - - unsigned NumElemsInLane = NumElems / NumLanes; - - // Blend for v16i16 should be symmetric for the both lanes. - for (unsigned i = 0; i < NumElemsInLane; ++i) { - SDValue EltCond = BuildVector->getOperand(i); - SDValue SndLaneEltCond = - (NumLanes == 2) ? 
BuildVector->getOperand(i + NumElemsInLane) : EltCond; - - int Lane1Cond = -1, Lane2Cond = -1; - if (isa<ConstantSDNode>(EltCond)) - Lane1Cond = !isNullConstant(EltCond); - if (isa<ConstantSDNode>(SndLaneEltCond)) - Lane2Cond = !isNullConstant(SndLaneEltCond); - - unsigned LaneMask = 0; - if (Lane1Cond == Lane2Cond || Lane2Cond < 0) - // Lane1Cond != 0, means we want the first argument. - // Lane1Cond == 0, means we want the second argument. - // The encoding of this argument is 0 for the first argument, 1 - // for the second. Therefore, invert the condition. - LaneMask = !Lane1Cond << i; - else if (Lane1Cond < 0) - LaneMask = !Lane2Cond << i; - else - return false; - - MaskValue |= LaneMask; - if (NumLanes == 2) - MaskValue |= LaneMask << NumElemsInLane; - } - return true; + llvm_unreachable("Unimplemented!"); } /// \brief Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); @@ -11624,7 +12328,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { return BlendOp; // Variable blends are only legal from SSE4.1 onward. - if (!Subtarget->hasSSE41()) + if (!Subtarget.hasSSE41()) return SDValue(); // Only some types will be legal on some subtargets. If we can emit a legal @@ -11637,7 +12341,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. - if (Subtarget->hasAVX2()) + if (Subtarget.hasAVX2()) return Op; return SDValue(); @@ -11645,7 +12349,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { case MVT::v8i16: case MVT::v16i16: // AVX-512 BWI and VLX features support VSELECT with i16 elements. - if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (Subtarget.hasBWI() && Subtarget.hasVLX()) return Op; // FIXME: We should custom lower this by fixing the condition and using i8 @@ -11723,7 +12427,7 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); - assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, @@ -11737,10 +12441,15 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - const TargetRegisterClass* rc = getRegClassFor(VecVT); - if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) - rc = getRegClassFor(MVT::v16i1); - unsigned MaxSift = rc->getSize()*8 - 1; + if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) { + // Use kshiftlw/rw instruction. 
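The ExtractBitFromMaskVector hunk around this point pulls bit IdxVal out of a mask register by shifting it up to the top bit and then back down to bit 0, the kshiftlw/kshiftrw pairing the new comment mentions. A scalar sketch of that step, not LLVM code:

#include <cstdint>
#include <cstdio>

static unsigned extractMaskBit(uint16_t Mask, unsigned IdxVal) {
  const unsigned MaxShift = 15;                  // getVectorNumElements() - 1 for v16i1.
  uint16_t Shifted = (uint16_t)(Mask << (MaxShift - IdxVal)); // kshiftlw
  return Shifted >> MaxShift;                    // kshiftrw: wanted bit lands at position 0.
}

int main() {
  std::printf("%u\n", extractMaskBit(0x0024, 5)); // bit 5 of 0b0000'0000'0010'0100 -> prints 1
}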
+ VecVT = MVT::v16i1; + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, + DAG.getUNDEF(VecVT), + Vec, + DAG.getIntPtrConstant(0, dl)); + } + unsigned MaxSift = VecVT.getVectorNumElements() - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, @@ -11762,7 +12471,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (!isa<ConstantSDNode>(Idx)) { if (VecVT.is512BitVector() || - (VecVT.is256BitVector() && Subtarget->hasInt256() && + (VecVT.is256BitVector() && Subtarget.hasInt256() && VecVT.getVectorElementType().getSizeInBits() == 32)) { MVT MaskEltVT = @@ -11782,13 +12491,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { - - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector. - Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); @@ -11803,38 +12512,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(VecVT.is128BitVector() && "Unexpected vector length"); - if (Subtarget->hasSSE41()) + if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { - SDValue Vec = Op.getOperand(0); - if (isNullConstant(Op.getOperand(1))) + if (IdxVal == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getBitcast(MVT::v4i32, Vec), - Op.getOperand(1))); + DAG.getBitcast(MVT::v4i32, Vec), Idx)); + // Transform it so it match pextrw which produces a 32-bit result. MVT EltVT = MVT::i32; - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, - Op.getOperand(0), Op.getOperand(1)); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } if (VT.getSizeInBits() == 32) { - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. - int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; - MVT VVT = Op.getOperand(0).getSimpleValueType(); - SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), - DAG.getUNDEF(VVT), Mask); + int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 }; + Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } @@ -11843,16 +12547,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - if (isNullConstant(Op.getOperand(1))) + if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
int Mask[2] = { 1, -1 }; - MVT VVT = Op.getOperand(0).getSimpleValueType(); - SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), - DAG.getUNDEF(VVT), Mask); + Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } @@ -11886,7 +12588,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if (IdxVal) EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); - if (Vec.getOpcode() == ISD::UNDEF) + if (Vec.isUndef()) return EltInVec; return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } @@ -11895,6 +12597,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); @@ -11908,6 +12611,19 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, auto *N2C = cast<ConstantSDNode>(N2); unsigned IdxVal = N2C->getZExtValue(); + // If we are clearing out a element, we do this more efficiently with a + // blend shuffle than a costly integer insertion. + // TODO: would other rematerializable values (e.g. allbits) benefit as well? + // TODO: pre-SSE41 targets will tend to use bit masking - this could still + // be beneficial if we are inserting several zeros and can combine the masks. + if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) { + SmallVector<int, 8> ClearMask; + for (unsigned i = 0; i != NumElts; ++i) + ClearMask.push_back(i == IdxVal ? i + NumElts : i); + SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask); + } + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { @@ -11917,8 +12633,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. - if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || - (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); @@ -11926,7 +12642,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } // Get the desired 128-bit vector chunk. - SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); + SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. 
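The new fast path in LowerINSERT_VECTOR_ELT above turns "insert a zero element" into a blend-style shuffle against an all-zero vector. A tiny standalone illustration of the ClearMask it builds (not LLVM code):

#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElts = 4, IdxVal = 2;        // e.g. clearing element 2 of a v4i32
  std::vector<int> ClearMask;
  for (unsigned i = 0; i != NumElts; ++i)
    ClearMask.push_back(i == IdxVal ? int(i + NumElts) : int(i));
  for (int M : ClearMask)
    std::printf("%d ", M);                       // prints: 0 1 6 3 (index 6 = lane 2 of the zero vector)
  std::printf("\n");
}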
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); @@ -11938,11 +12654,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, DAG.getConstant(IdxIn128, dl, MVT::i32)); // Insert the changed part back into the bigger vector - return Insert128BitVector(N0, V, IdxVal, DAG, dl); + return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { unsigned Opc; if (VT == MVT::v8i16) { @@ -12026,7 +12742,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. - return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); + return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } if (OpVT == MVT::v1i64 && @@ -12042,7 +12758,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. -static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); SDValue In = Op.getOperand(0); @@ -12051,15 +12767,15 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, MVT ResVT = Op.getSimpleValueType(); MVT InVT = In.getSimpleValueType(); - if (Subtarget->hasFp256()) { + if (Subtarget.hasFp256()) { if (ResVT.is128BitVector() && (InVT.is256BitVector() || InVT.is512BitVector()) && isa<ConstantSDNode>(Idx)) { - return Extract128BitVector(In, IdxVal, DAG, dl); + return extract128BitVector(In, IdxVal, DAG, dl); } if (ResVT.is256BitVector() && InVT.is512BitVector() && isa<ConstantSDNode>(Idx)) { - return Extract256BitVector(In, IdxVal, DAG, dl); + return extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); @@ -12068,9 +12784,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. -static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasAVX()) + if (!Subtarget.hasAVX()) return SDValue(); SDLoc dl(Op); @@ -12094,16 +12810,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, OpVT.is256BitVector() && SubVecVT.is128BitVector()) { auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); if (Idx2 && Idx2->getZExtValue() == 0) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through a bitcast to get to the load. - if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) - SubVec2 = SubVec2.getOperand(0); - + // If needed, look through bitcasts to get to the load. 
+ SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1)); if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { bool Fast; unsigned Alignment = FirstLd->getAlignment(); unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT, AS, Alignment, &Fast) && Fast) { SDValue Ops[] = { SubVec2, SubVec }; @@ -12116,13 +12829,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && SubVecVT.is128BitVector()) - return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) - return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); if (OpVT.getVectorElementType() == MVT::i1) - return Insert1BitVector(Op, DAG); + return insert1BitVector(Op, DAG, Subtarget); return SDValue(); } @@ -12139,17 +12852,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. - unsigned char OpFlag = 0; + unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); - if (Subtarget->isPICStyleRIPRel() && + if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; - else if (Subtarget->isPICStyleGOT()) - OpFlag = X86II::MO_GOTOFF; - else if (Subtarget->isPICStyleStubPIC()) - OpFlag = X86II::MO_PIC_BASE_OFFSET; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( @@ -12171,17 +12880,13 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. - unsigned char OpFlag = 0; + unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); - if (Subtarget->isPICStyleRIPRel() && + if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; - else if (Subtarget->isPICStyleGOT()) - OpFlag = X86II::MO_GOTOFF; - else if (Subtarget->isPICStyleStubPIC()) - OpFlag = X86II::MO_PIC_BASE_OFFSET; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); @@ -12203,22 +12908,14 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. 
- unsigned char OpFlag = 0; + const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); unsigned WrapperKind = X86ISD::Wrapper; CodeModel::Model M = DAG.getTarget().getCodeModel(); - if (Subtarget->isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) { - if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) - OpFlag = X86II::MO_GOTPCREL; + if (Subtarget.isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) WrapperKind = X86ISD::WrapperRIP; - } else if (Subtarget->isPICStyleGOT()) { - OpFlag = X86II::MO_GOT; - } else if (Subtarget->isPICStyleStubPIC()) { - OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; - } else if (Subtarget->isPICStyleStubNoDynamic()) { - OpFlag = X86II::MO_DARWIN_NONLAZY; - } auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); @@ -12227,8 +12924,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. - if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && - !Subtarget->is64Bit()) { + if (isPositionIndependent() && !Subtarget.is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); @@ -12238,8 +12934,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } @@ -12248,7 +12943,7 @@ SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = - Subtarget->ClassifyBlockAddressReference(); + Subtarget.classifyBlockAddressReference(); CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); @@ -12256,7 +12951,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); - if (Subtarget->isPICStyleRIPRel() && + if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else @@ -12271,13 +12966,12 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue -X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, - int64_t Offset, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, + const SDLoc &dl, int64_t Offset, + SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. 
- unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); + unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; @@ -12290,7 +12984,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } - if (Subtarget->isPICStyleRIPRel() && + if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else @@ -12306,8 +13000,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -12429,7 +13122,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), - MachinePointerInfo(Ptr), false, false, false, 0); + MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -12464,8 +13157,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), - false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread @@ -12478,45 +13170,40 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); - // Cygwin uses emutls. - // FIXME: It may be EmulatedTLS-generic also for X86-Android. - if (Subtarget->isTargetWindowsCygwin()) + if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool PositionIndependent = isPositionIndependent(); - if (Subtarget->isTargetELF()) { - if (DAG.getTarget().Options.EmulatedTLS) - return LowerToTLSEmulatedModel(GA, DAG); + if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: - if (Subtarget->is64Bit()) + if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, - Subtarget->is64Bit()); + Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(), - DAG.getTarget().getRelocationModel() == - Reloc::PIC_); + return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), + PositionIndependent); } llvm_unreachable("Unknown TLS model."); } - if (Subtarget->isTargetDarwin()) { + if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; - unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? 
X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. - bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && - !Subtarget->is64Bit(); + bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else @@ -12540,9 +13227,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); - Chain = - DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), - DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), + Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -12550,12 +13237,13 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // And our return value (tls address) is in the standard call return value // location. - unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } - if (Subtarget->isTargetKnownWindowsMSVC() || - Subtarget->isTargetWindowsGNU()) { + if (Subtarget.isTargetKnownWindowsMSVC() || + Subtarget.isTargetWindowsItanium() || + Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -12573,21 +13261,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. - Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); - SDValue TlsArray = Subtarget->is64Bit() + SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) - : (Subtarget->isTargetWindowsGNU() + : (Subtarget.isTargetWindowsGNU() ? 
DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = - DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, - false, false, 0); + DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { @@ -12595,13 +13282,11 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); - if (Subtarget->is64Bit()) + if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, - MachinePointerInfo(), MVT::i32, false, false, - false, 0); + MachinePointerInfo(), MVT::i32); else - IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false, - false, false, 0); + IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = @@ -12611,8 +13296,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } - res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false, - false, 0); + res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, @@ -12628,7 +13312,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("TLS not implemented for this target."); } -/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); @@ -12711,13 +13395,13 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && - Subtarget->is64Bit()) { + Subtarget.is64Bit()) { return Op; } SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && - !Subtarget->is64Bit()) + !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. @@ -12730,8 +13414,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, - false, 0); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -12789,14 +13472,13 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Ops, Op.getValueType(), MMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); } return Result; } -// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. +/// 64-bit unsigned integer to double expansion. 
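LowerUINT_TO_FP_i64, whose hunk follows, expands an unsigned 64-bit to double conversion with the classic exponent-bias splice; the DAG version keeps its constants in a constant pool and finishes with haddpd or addsd. Below is a scalar sketch of the underlying arithmetic, with the usual 2^52 and 2^84 bias constants assumed rather than read from this diff:

#include <cstdint>
#include <cstdio>
#include <cstring>

static double bitsToDouble(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

static double uint64ToDouble(uint64_t X) {
  // Splice the halves into the mantissas of 2^52 and 2^84, then strip the biases.
  double Lo = bitsToDouble(0x4330000000000000ULL | (X & 0xFFFFFFFFULL)) - 0x1.0p52;
  double Hi = bitsToDouble(0x4530000000000000ULL | (X >> 32)) - 0x1.0p84;
  return Lo + Hi;                                // one rounding step, in the final add
}

int main() {
  std::printf("%.1f\n", uint64ToDouble(0x8000000000000001ULL)); // ~9223372036854775808.0
}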
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { // This algorithm is not obvious. Here it is what we're trying to output: @@ -12837,20 +13519,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 16); + /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 16); + /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget->hasSSE3()) { + if (Subtarget.hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { @@ -12865,7 +13547,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, DAG.getIntPtrConstant(0, dl)); } -// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. +/// 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -12945,10 +13627,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); - unsigned NumElts = VecIntVT.getVectorNumElements(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); - assert(NumElts <= 8 && "The size of the constant array must be fixed"); // In the #idef/#else code, we have in common: // - The vector of constants: @@ -12958,24 +13638,12 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // -- v >> 16 // Create the splat vector for 0x4b000000. - SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32); - SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, - CstLow, CstLow, CstLow, CstLow}; - SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, - makeArrayRef(&CstLowArray[0], NumElts)); + SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. - SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); - SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, - CstHigh, CstHigh, CstHigh, CstHigh}; - SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, - makeArrayRef(&CstHighArray[0], NumElts)); + SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. 
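The 0x4b000000 and 0x53000000 splats built in the surrounding lowerUINT_TO_FP_vXi32 hunk implement the uint4-to-float4 trick spelled out in its comments: splice the 16-bit halves into the mantissas of 2^23 and 2^39, then cancel both biases with one FADD of -(0x1.0p39f + 0x1.0p23f). A scalar sketch, not the DAG code:

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static float uint32ToFloat(uint32_t V) {
  float Lo = bitsToFloat((V & 0xffffu) | 0x4b000000u); // 2^23 + low half
  float Hi = bitsToFloat((V >> 16) | 0x53000000u);     // 2^39 + high half * 2^16
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);            // strip both biases
  return Lo + FHi;                                     // exact until this final add
}

int main() {
  std::printf("%.1f\n", uint32ToFloat(0xFFFFFFFFu));   // prints 4294967296.0 (rounded)
}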
- SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); - SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, - CstShift, CstShift, CstShift, CstShift}; - SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, - makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; @@ -12997,9 +13665,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { - SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); - SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, - CstMask, CstMask, CstMask); + SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); @@ -13009,12 +13675,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). - SDValue CstFAdd = DAG.getConstantFP( - APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); - SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, - CstFAdd, CstFAdd, CstFAdd, CstFAdd}; - SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, - makeArrayRef(&CstFAddArray[0], NumElts)); + SDValue VecCstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); @@ -13045,10 +13707,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, } case MVT::v4i32: case MVT::v8i32: - return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); case MVT::v16i8: case MVT::v16i16: - assert(Subtarget->hasAVX512()); + assert(Subtarget.hasAVX512()); return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } @@ -13072,8 +13734,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && - (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; @@ -13083,34 +13745,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); - if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) + if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. 
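The SrcVT == MVT::i32 path that follows stores the 32-bit operand plus a zero high word into the 64-bit slot, so the x87 FILD sees a small non-negative i64 and an ordinary signed conversion yields the unsigned result. Scalar equivalent (not LLVM code):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Src = 0xFFFFFFFFu;
  int64_t Slot = (int64_t)(uint64_t)Src;         // { Src, 0 } written into the 64-bit buffer
  double D = (double)Slot;                       // what the i64 FILD computes
  std::printf("%.1f\n", D);                      // prints 4294967295.0
}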
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { - SDValue WordOff = DAG.getConstant(4, dl, PtrVT); - SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); + SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo(), - false, false, 0); + StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), - OffsetSlot, MachinePointerInfo(), - false, false, 0); + OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); - if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, - StackSlot, MachinePointerInfo(), - false, false, 0); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, + MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -13149,7 +13807,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, - false, false, false, 4); + /* Alignment = */ 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); @@ -13186,10 +13844,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64 && - (!Subtarget->is64Bit() || + (!Subtarget.is64Bit() || !isScalarFPTypeInSSEReg(TheVT)); - if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. 
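The fp-to-uint32 fallback described just above widens the operation to an fp-to-sint64 FIST and keeps only the low 32 bits, which are correct for every value that fits in a uint32. A scalar sketch, not LLVM code:

#include <cstdint>
#include <cstdio>

int main() {
  double D = 4000000000.0;                       // > INT32_MAX but still a valid uint32
  int64_t AsI64 = (int64_t)D;                    // fp-to-sint64 (FISTP m64)
  uint32_t AsU32 = (uint32_t)AsI64;              // the low 32 bits carry the uint32 result
  std::printf("%u\n", AsU32);                    // prints 4000000000
}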
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); @@ -13204,7 +13862,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - if (Subtarget->is64Bit() && + if (Subtarget.is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); @@ -13280,8 +13938,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(MF, SSFI), false, - false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI)); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) @@ -13309,18 +13966,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), FistOps, DstTy, MMO); - SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, - MachinePointerInfo(), - false, false, false, 0); - SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, - DAG.getConstant(4, DL, PtrVT)); + SDValue Low32 = + DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); + SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); - SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, - MachinePointerInfo(), - false, false, false, 0); + SDValue High32 = + DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { // Join High32 and Low32 into a 64-bit result. // (High32 << 32) | Low32 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); @@ -13347,7 +14001,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); @@ -13374,7 +14028,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) return SDValue(); - if (Subtarget->hasInt256()) + if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VZEXT, dl, VT, In); SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); @@ -13393,41 +14047,46 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, } static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); assert(InVT.getVectorElementType() == MVT::i1); - MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32; + + // Extend VT if the target is 256 or 128bit vector and VLX is not supported. 
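LowerZERO_EXTEND_AVX512 in the hunk above extends a vXi1 mask by selecting between splat(1) and splat(0) in a wider element type (512/NumElts bits when VLX is unavailable) and truncating afterwards if that type is wider than requested. A scalar sketch of the idea, not the DAG code:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t Mask = 0x0B;                           // v8i1 input: 0000'1011
  int64_t Extended[8];                           // ExtVT = v8i64 (512 / 8 elements)
  for (int i = 0; i < 8; ++i)
    Extended[i] = (Mask >> i) & 1 ? 1 : 0;       // VSELECT between splat(1) and splat(0)
  int32_t Truncated[8];                          // VTRUNC down to the requested v8i32
  for (int i = 0; i < 8; ++i)
    Truncated[i] = (int32_t)Extended[i];
  for (int32_t V : Truncated)
    std::printf("%d ", V);                       // prints: 1 1 0 1 0 0 0 0
  std::printf("\n");
}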
+ MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue One = DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); - SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); - if (VT.is512BitVector()) - return V; - return DAG.getNode(X86ISD::VTRUNC, DL, VT, V); + SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + if (VT == ExtVT) + return SelectedVal; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); } -static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) + if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; return SDValue(); } -static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -13437,7 +14096,7 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); - if (Subtarget->hasFp256()) + if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; @@ -13447,50 +14106,32 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); - assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); - // Shift LSB to MSB and use VPMOVB2M - SKX. + // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; - if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M - ((InVT.is256BitVector() || InVT.is128BitVector()) && - InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && - Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M - // Shift packed bytes not supported natively, bitcast to dword - MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); - SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, - DAG.getBitcast(ExtVT, In), - DAG.getConstant(ShiftInx, DL, ExtVT)); - ShiftNode = DAG.getBitcast(InVT, ShiftNode); - return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); - } - if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M - ((InVT.is256BitVector() || InVT.is128BitVector()) && - InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && - Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M - - SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, - DAG.getConstant(ShiftInx, DL, InVT)); - return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); - } - - // Shift LSB to MSB, extend if necessary and use TESTM. 
- unsigned NumElts = InVT.getVectorNumElements(); - if (InVT.getSizeInBits() < 512 && - (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || - !Subtarget->hasVLX())) { - assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); - - // TESTD/Q should be used (if BW supported we use CVT2MASK above), - // so vector should be extended to packed dword/qword. + if (InVT.getScalarSizeInBits() <= 16) { + if (Subtarget.hasBWI()) { + // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to word + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + // Use TESTD/Q, extended vector to packed dword/qword. + assert((InVT.is256BitVector() || InVT.is128BitVector()) && + "Unexpected vector type."); + unsigned NumElts = InVT.getVectorNumElements(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; @@ -13523,16 +14164,16 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb - if (Subtarget->hasAVX512()) { + if (Subtarget.hasAVX512()) { // word to byte only under BWI - if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. - if (Subtarget->hasInt256()) { + if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), @@ -13553,7 +14194,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 
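The LowerTruncateVecI1 rewrite above keeps only bit 0 of every element by shifting it into the sign position and then collecting the sign bits with a mask instruction (VPMOVB2M/VPMOVW2M, or TESTD/Q after widening). A rough SSE2-only sketch of the same shift-then-collect idea, with MOVMSKPS standing in for the AVX-512 mask extraction:

    #include <immintrin.h>

    // Truncate v4i32 to a 4-bit i1 mask: move each lane's LSB up to the MSB,
    // then gather the per-lane sign bits into a scalar.
    static inline int truncate_v4i32_to_v4i1(__m128i v) {
      __m128i msb = _mm_slli_epi32(v, 31);              // LSB -> MSB per lane
      return _mm_movemask_ps(_mm_castsi128_ps(msb));    // one bit per lane
    }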
- if (Subtarget->hasInt256()) { + if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); SmallVector<SDValue,32> pshufbMask; @@ -13569,13 +14210,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); + SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), - &ShufMask[0]); + ShufMask); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); @@ -13611,7 +14252,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); - assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); + assert(Subtarget.hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); @@ -13621,7 +14262,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), - DAG.getUNDEF(NVT), &MaskVec[0]); + DAG.getUNDEF(NVT), MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } @@ -13639,9 +14280,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (StackSlot.getNode()) // Load the result. - return DAG.getLoad(Op.getValueType(), SDLoc(Op), - FIST, StackSlot, MachinePointerInfo(), - false, false, false, 0); + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, + MachinePointerInfo()); // The node is the result. return FIST; @@ -13658,9 +14298,8 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, if (StackSlot.getNode()) // Load the result. - return DAG.getLoad(Op.getValueType(), SDLoc(Op), - FIST, StackSlot, MachinePointerInfo(), - false, false, false, 0); + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, + MachinePointerInfo()); // The node is the result. 
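The integer TRUNCATE paths above all reduce to keeping the low half of every wide element, expressed as a shuffle over {0, 2, 4, 6, ...}. A sketch of that even-element selection for v2i64 -> v4i32 with SHUFPS, assuming SSE2 and the usual little-endian lane layout:

    #include <immintrin.h>

    // Pick the low 32 bits of each 64-bit element from two v2i64 inputs,
    // i.e. elements {0, 2} of each vector when viewed as v4i32.
    static inline __m128i trunc_2x_v2i64_to_v4i32(__m128i lo, __m128i hi) {
      __m128 l = _mm_castsi128_ps(lo);
      __m128 h = _mm_castsi128_ps(hi);
      // result = { l[0], l[2], h[0], h[2] }
      return _mm_castps_si128(_mm_shuffle_ps(l, h, _MM_SHUFFLE(2, 0, 2, 0)));
    }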
return FIST; @@ -13736,10 +14375,9 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = - DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, Alignment); + SDValue Mask = DAG.getLoad( + LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -13807,7 +14445,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 16); + /* Alignment = */ 16); if (!IsF128) Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); @@ -13833,7 +14471,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - false, false, false, 16); + /* Alignment = */ 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa<ConstantFPSDNode>(Op0)) { if (!IsF128) @@ -13852,18 +14490,25 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). - SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, - DAG.getConstant(1, dl, VT)); - return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT)); + MVT OpVT = N0.getSimpleValueType(); + assert((OpVT == MVT::f32 || OpVT == MVT::f64) && + "Unexpected type for FGETSIGN"); + + // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). + MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); + SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); + Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); + Res = DAG.getZExtOrTrunc(Res, dl, VT); + Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); + return Res; } // Check whether an OR'd tree is PTEST-able. 
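The new LowerFGETSIGN above lowers ISD::FGETSIGN to SCALAR_TO_VECTOR + X86ISD::MOVMSK + AND 1. The same sequence at the intrinsic level, as a sketch for the f32 case:

    #include <immintrin.h>

    // Sign bit of a scalar float via MOVMSKPS: place the value in lane 0,
    // collect the per-lane sign bits, keep only bit 0.
    static inline int fgetsign_f32(float x) {
      __m128 v = _mm_set_ss(x);           // SCALAR_TO_VECTOR
      int mask = _mm_movemask_ps(v);      // X86ISD::MOVMSK
      return mask & 1;
    }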
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); - if (!Subtarget->hasSSE41()) + if (!Subtarget.hasSSE41()) return SDValue(); if (!Op->hasOneUse()) @@ -13969,9 +14614,27 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } +// Emit KTEST instruction for bit vectors on AVX-512 +static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (Op.getOpcode() == ISD::BITCAST) { + auto hasKTEST = [&](MVT VT) { + unsigned SizeInBits = VT.getSizeInBits(); + return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || + (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); + }; + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = Op0.getValueType().getSimpleVT(); + if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && + hasKTEST(Op0VT)) + return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); + } + return SDValue(); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. -SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) { SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); @@ -14014,10 +14677,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { + // Emit KTEST for bit vectors + if (auto Node = EmitKTEST(Op, DAG, Subtarget)) + return Node; // Emit a CMP with 0, which is the TEST pattern. - //if (Op.getValueType() == MVT::i1) - // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, - // DAG.getConstant(0, MVT::i1)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } @@ -14071,14 +14734,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->isOne() && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget.slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget.slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -14106,18 +14769,26 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (!Mask.isSignedIntN(32)) // Avoid large immediates. break; - SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), - DAG.getConstant(Mask, dl, VT)); - DAG.ReplaceAllUsesWith(Op, New); - Op = New; + Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, dl, VT)); } break; case ISD::AND: - // If the primary and result isn't used, don't bother using X86ISD::AND, + // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. 
- if (!hasNonFlagsUse(Op)) - break; + if (!hasNonFlagsUse(Op)) { + SDValue Op0 = ArithOp->getOperand(0); + SDValue Op1 = ArithOp->getOperand(1); + EVT VT = ArithOp.getValueType(); + bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); + bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + + // But if we can combine this into an ANDN operation, then create an AND + // now and allow it to be pattern matched into an ANDN. + if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) + break; + } // FALL THROUGH case ISD::SUB: case ISD::OR: @@ -14137,8 +14808,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { - SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); - if (EFLAGS.getNode()) + if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } Opcode = X86ISD::OR; @@ -14190,11 +14860,15 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, } } - if (Opcode == 0) + if (Opcode == 0) { + // Emit KTEST for bit vectors + if (auto Node = EmitKTEST(Op, DAG, Subtarget)) + return Node; + // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); - + } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); @@ -14206,7 +14880,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - SDLoc dl, SelectionDAG &DAG) const { + const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG); @@ -14215,13 +14889,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Do the comparison at i32 if it's smaller, besides the Atom case. - // This avoids subregister aliasing issues. Keep the smaller reference - // if we're optimizing for size, however, as that'll allow better folding - // of memory operations. - if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && + // Only promote the compare up to I32 if it is a 16 bit operation + // with an immediate. 16 bit immediates are to be avoided. + if ((Op0.getValueType() == MVT::i16 && + (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) && !DAG.getMachineFunction().getFunction()->optForMinSize() && - !Subtarget->isAtom()) { + !Subtarget.isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); @@ -14241,7 +14914,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. 
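The ISD::AND fall-through above now keeps the AND node alive when one operand is a bitwise NOT, the target has BMI, and the type is a legal i32/i64, so that selection can later form ANDN and reuse its flags. A sketch of the source-level shape being preserved (whether a given compiler actually emits ANDN depends on -mbmi and the surrounding code):

    #include <stdint.h>

    // The isBitwiseNot(Op0) case: (~a) & b with an i64 result, the pattern
    // that BMI's ANDN covers while also producing the flags a following
    // compare-against-zero would need.
    static inline uint64_t andn_shape(uint64_t a, uint64_t b) {
      return ~a & b;
    }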
- if (Subtarget->hasCMov() || + if (Subtarget.hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) @@ -14259,7 +14932,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. - assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); + assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -14279,10 +14952,10 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if (VT == MVT::f32 && Subtarget->hasSSE1()) + if (VT == MVT::f32 && Subtarget.hasSSE1()) RecipOp = "sqrtf"; - else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || - (VT == MVT::v8f32 && Subtarget->hasAVX())) + else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || + (VT == MVT::v8f32 && Subtarget.hasAVX())) RecipOp = "vec-sqrtf"; else return SDValue(); @@ -14311,10 +14984,10 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if (VT == MVT::f32 && Subtarget->hasSSE1()) + if (VT == MVT::f32 && Subtarget.hasSSE1()) RecipOp = "divf"; - else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || - (VT == MVT::v8f32 && Subtarget->hasAVX())) + else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || + (VT == MVT::v8f32 && Subtarget.hasAVX())) RecipOp = "vec-divf"; else return SDValue(); @@ -14337,10 +15010,9 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } -/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node -/// if it's possible. +/// Result of 'and' is compared against zero. Change to a BT node if possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) const { + const SDLoc &dl, SelectionDAG &DAG) const { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -14353,19 +15025,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { - // If we looked past a truncate, check that it's only truncating away - // known zeros. - unsigned BitWidth = Op0.getValueSizeInBits(); - unsigned AndBitWidth = And.getValueSizeInBits(); - if (BitWidth > AndBitWidth) { - APInt Zeros, Ones; - DAG.computeKnownBits(Op0, Zeros, Ones); - if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) - return SDValue(); - } - LHS = Op1; - RHS = Op0.getOperand(1); + // If we looked past a truncate, check that it's only truncating away + // known zeros. 
+ unsigned BitWidth = Op0.getValueSizeInBits(); + unsigned AndBitWidth = And.getValueSizeInBits(); + if (BitWidth > AndBitWidth) { + APInt Zeros, Ones; + DAG.computeKnownBits(Op0, Zeros, Ones); + if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + return SDValue(); } + LHS = Op1; + RHS = Op0.getOperand(1); + } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); @@ -14407,8 +15079,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } -/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point -/// mask CMPs. +/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask +/// CMPs. static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; @@ -14452,8 +15124,8 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, return SSECC; } -// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 -// ones, and then concatenate the result back. +/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then +/// concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -14466,13 +15138,13 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); @@ -14525,16 +15197,15 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { } } -static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && - Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + assert(VT.getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); @@ -14568,8 +15239,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. 
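For reference, the shape LowerToBT matches in the hunk above is a single-bit AND compared against zero; on x86 that pairs naturally with BT plus a flag-consuming SETcc/Jcc. A trivial sketch of the source pattern:

    #include <stdint.h>

    // Single-bit test: AND with (1 << n), compared against zero.
    static inline bool test_bit(uint32_t x, unsigned n) {
      return (x & (1u << n)) != 0;
    }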
-static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) -{ +static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1, + SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); if (!BV) return SDValue(); @@ -14592,10 +15263,10 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); + return DAG.getBuildVector(VT, dl, ULTOp1); } -static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -14611,32 +15282,59 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif - unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); - unsigned Opc = X86ISD::CMPP; - if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { + unsigned Opc; + if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = X86ISD::CMPM; - } - // In the two special cases we can't handle, emit two comparisons. + } else { + Opc = X86ISD::CMPP; + // The SSE/AVX packed FP comparison nodes are defined with a + // floating-point vector result that matches the operand type. This allows + // them to work with an SSE1 target (integer vector types are not legal). + VT = Op0.getSimpleValueType(); + } + + // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), + // emit two comparisons and a logic op to tie them together. + // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is + // available. + SDValue Cmp; + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); if (SSECC == 8) { + // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; if (SetCCOpcode == ISD::SETUEQ) { - CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + CC0 = 3; // UNORD + CC1 = 0; // EQ + CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : + static_cast<unsigned>(ISD::OR); } else { assert(SetCCOpcode == ISD::SETONE); - CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; + CC0 = 7; // ORD + CC1 = 4; // NEQ + CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : + static_cast<unsigned>(ISD::AND); } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, dl, MVT::i8)); - return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); + Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); + } else { + // Handle all other FP comparisons here. + Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, dl, MVT::i8)); } - // Handle all other FP comparisons here. - return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, dl, MVT::i8)); + + // If this is SSE/AVX CMPP, bitcast the result back to integer to match the + // result type of SETCC. The bitcast is expected to be optimized away + // during combining/isel. + if (Opc == X86ISD::CMPP) + Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + + return Cmp; } MVT VTOp0 = Op0.getSimpleValueType(); @@ -14665,38 +15363,38 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // The non-AVX512 code below works under the assumption that source and // destination types are the same. 
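SETUEQ and SETONE have no single SSE compare predicate (translateX86FSETCC returns 8 for them), so the code above emits two compares and ties them together with FOR/FAND. A sketch of the SETUEQ half with SSE1 intrinsics:

    #include <immintrin.h>

    // "Unordered or equal": CMPUNORDPS (predicate 3) OR CMPEQPS (predicate 0),
    // mirroring the CC0/CC1/CombineOpc choice above.
    static inline __m128 cmp_ueq_ps(__m128 a, __m128 b) {
      __m128 unord = _mm_cmpunord_ps(a, b);
      __m128 eq    = _mm_cmpeq_ps(a, b);
      return _mm_or_ps(unord, eq);
    }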
- assert((Subtarget->hasAVX512() || (VT == VTOp0)) && + assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // Break 256-bit integer vector compare into smaller ones. - if (VT.is256BitVector() && !Subtarget->hasInt256()) + if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); + // Operands are boolean (vectors of i1) MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); - bool MaskResult = (VT.getVectorElementType() == MVT::i1); - if (Subtarget->hasAVX512()) { - if (Op1.getSimpleValueType().is512BitVector() || - (Subtarget->hasBWI() && Subtarget->hasVLX()) || - (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) - return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); - + // The result is boolean, but operands are int/float + if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. - // We are not talking about 512-bit operands in this case, these - // types are illegal. - if (MaskResult && - (OpVT.getVectorElementType().getSizeInBits() < 32 && - OpVT.getVectorElementType().getSizeInBits() >= 8)) - return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); + // In this case use SSE compare + bool UseAVX512Inst = + (OpVT.is512BitVector() || + OpVT.getVectorElementType().getSizeInBits() >= 32 || + (Subtarget.hasBWI() && Subtarget.hasVLX())); + + if (UseAVX512Inst) + return LowerIntVSETCC_AVX512(Op, DAG); + + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } // Lower using XOP integer comparisons. if ((VT == MVT::v16i8 || VT == MVT::v8i16 || - VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (SetCCOpcode) { @@ -14748,8 +15446,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool hasMinMax = - (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) - || (Subtarget->hasSSE2() && (VET == MVT::i8)); + (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget.hasSSE2() && (VET == MVT::i8)); if (hasMinMax) { switch (SetCCOpcode) { @@ -14761,7 +15459,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } } - bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); + bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); if (!MinMax && hasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: @@ -14775,10 +15473,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. 
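The SETULE/SETUGE special case above relies on the identity "a <=u b iff umin(a, b) == a", which turns an unsigned compare into a min plus an equality compare; the PSUBUS path right after it is the saturating-subtract variant of the same trick. A sketch of the min/max form for 32-bit lanes, assuming SSE4.1 for PMINUD:

    #include <immintrin.h>

    // Unsigned a <= b per 32-bit lane: umin(a, b) == a  (PMINUD + PCMPEQD).
    static inline __m128i cmple_epu32(__m128i a, __m128i b) {
      return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
    }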
- if (Subtarget->hasAVX()) + if (Subtarget.hasAVX()) break; - SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); - if (ULEOp1.getNode()) { + if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; } @@ -14801,8 +15498,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { - if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { - assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { + assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); @@ -14817,8 +15514,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, } else { SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); - SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, - Sign, Zero, Sign, Zero); + SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); @@ -14843,10 +15539,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, return DAG.getBitcast(VT, Result); } - if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { + if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. - assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); + assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. 
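When no unsigned packed compare exists, the code above biases both operands by 0x80000000 (the XOR with the Sign constant) so a signed PCMPGT gives the unsigned ordering. The same trick for plain 32-bit lanes on SSE2:

    #include <immintrin.h>
    #include <stdint.h>

    // Unsigned a > b per 32-bit lane: flip the sign bit of both sides, then
    // use the signed PCMPGTD.
    static inline __m128i cmpgt_epu32(__m128i a, __m128i b) {
      const __m128i bias = _mm_set1_epi32(INT32_MIN);   // 0x80000000 per lane
      return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }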
Op0 = DAG.getBitcast(MVT::v4i32, Op0); @@ -14899,7 +15595,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) + assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) && "SetCC type must be 8-bit or 1-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -14914,8 +15610,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { - if (VT == MVT::i1) + if (VT == MVT::i1) { + NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC, + DAG.getValueType(MVT::i1)); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); + } return NewSetCC; } } @@ -14937,16 +15636,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(CCode, dl, MVT::i8), Op0.getOperand(1)); - if (VT == MVT::i1) + if (VT == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + } return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - - ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); - return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); + if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + if (isOneConstant(Op1)) { + ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); + return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); + } + if (!isNullConstant(Op1)) { + SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1); + return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC); + } } bool isFP = Op1.getSimpleValueType().isFloatingPoint(); @@ -14958,8 +15664,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); - if (VT == MVT::i1) + if (VT == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + } return SetCC; } @@ -14978,12 +15687,15 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); - if (Op.getSimpleValueType() == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + if (Op.getSimpleValueType() == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + } return SetCC; } -// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. +/// Return true if opcode is a X86 logical comparison. 
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || @@ -15009,14 +15721,23 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { +/// Returns the "condition" node, that may be wrapped with "truncate". +/// Like this: (i1 (trunc (i8 X86ISD::SETCC))). +static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) - return false; + return V; SDValue VOp0 = V.getOperand(0); + if (VOp0.getOpcode() == ISD::AssertZext && + V.getValueSizeInBits() == + cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits()) + return VOp0.getOperand(0); + unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); - return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); + if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits))) + return V.getOperand(0); + return V; } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -15032,15 +15753,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && - ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || - (Subtarget->hasSSE1() && VT == MVT::f32)) && + ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (SSECC != 8) { - if (Subtarget->hasAVX512()) { + if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); @@ -15062,7 +15783,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. - if (Subtarget->hasAVX() && + if (Subtarget.hasAVX() && !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. @@ -15122,8 +15843,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (Cond.getOpcode() == ISD::SETCC) { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) + if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } @@ -15240,8 +15960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look past the truncate if the high bits are known zero. - if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); // We know the result of AND is compared against zero. Try to match // it to BT. 
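LowerSELECT above turns a scalar FP select of an FP compare into vector operations (an FSETCC mask combined with AND/ANDN/OR, or a VBLENDV when AVX is available) instead of a branch. A sketch of the blend form, assuming AVX for the immediate-predicate CMPSS and SSE4.1 for BLENDVPS:

    #include <immintrin.h>

    // Branchless (a >= b) ? x : y for f32: build an all-ones/all-zeros mask in
    // lane 0, then blend y and x under that mask.
    static inline float select_ge(float a, float b, float x, float y) {
      __m128 mask = _mm_cmp_ss(_mm_set_ss(a), _mm_set_ss(b), _CMP_GE_OQ);
      __m128 r    = _mm_blendv_ps(_mm_set_ss(y), _mm_set_ss(x), mask);
      return _mm_cvtss_f32(r);
    }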
@@ -15302,7 +16021,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); @@ -15313,22 +16032,22 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, // SKX processor if ((InVTElt == MVT::i1) && - (((Subtarget->hasBWI() && Subtarget->hasVLX() && + (((Subtarget.hasBWI() && Subtarget.hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || - ((Subtarget->hasBWI() && VT.is512BitVector() && + ((Subtarget.hasBWI() && VT.is512BitVector() && VTElt.getSizeInBits() <= 16)) || - ((Subtarget->hasDQI() && Subtarget->hasVLX() && + ((Subtarget.hasDQI() && Subtarget.hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || - ((Subtarget->hasDQI() && VT.is512BitVector() && + ((Subtarget.hasDQI() && VT.is512BitVector() && VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { @@ -15352,25 +16071,35 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, } static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); + MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); - assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); + assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); + if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && + !(VT.is256BitVector() && Subtarget.hasInt256())) + return SDValue(); SDLoc dl(Op); + // For 256-bit vectors, we only need the lower (128-bit) half of the input. + if (VT.is256BitVector()) + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, + MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2), + In, DAG.getIntPtrConstant(0, dl)); + // SSE41 targets can use the pmovsx* instructions directly. - if (Subtarget->hasSSE41()) + if (Subtarget.hasSSE41()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. 
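The pre-SSE4.1 sign-extension path noted above ("unpack lower lanes and then sign-extend using SRAI") can be written directly with SSE2 intrinsics; a sketch for extending the low four i16 lanes to i32:

    #include <immintrin.h>

    // Place each i16 value in the upper half of a dword lane via an interleave
    // with zero, then arithmetic-shift right by 16 so the sign bit fills in.
    static inline __m128i sext_lo4_epi16_to_epi32(__m128i v) {
      __m128i hi = _mm_unpacklo_epi16(_mm_setzero_si128(), v);
      return _mm_srai_epi32(hi, 16);
    }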
@@ -15407,7 +16136,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, return SDValue(); } -static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); @@ -15422,7 +16151,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, (VT != MVT::v16i16 || InVT != MVT::v16i8)) return SDValue(); - if (Subtarget->hasInt256()) + if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode @@ -15441,13 +16170,13 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; - SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); + SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1); SmallVector<int,8> ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; - SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); + SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); @@ -15458,6 +16187,157 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +// Lower truncating store. We need a special lowering to vXi1 vectors +static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + StoreSDNode *St = cast<StoreSDNode>(StOp.getNode()); + SDLoc dl(St); + EVT MemVT = St->getMemoryVT(); + assert(St->isTruncatingStore() && "We only custom truncating store."); + assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && + "Expected truncstore of i1 vector"); + + SDValue Op = St->getValue(); + MVT OpVT = Op.getValueType().getSimpleVT(); + unsigned NumElts = OpVT.getVectorNumElements(); + if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + NumElts == 16) { + // Truncate and store - everything is legal + Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op); + if (MemVT.getSizeInBits() < 8) + Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op, + DAG.getIntPtrConstant(0, dl)); + return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), + St->getMemOperand()); + } + + // A subset, assume that we have only AVX-512F + if (NumElts <= 8) { + if (NumElts < 8) { + // Extend to 8-elts vector + MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8); + Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT, + DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); + } + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); + return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), + St->getMemOperand()); + } + // v32i8 + assert(OpVT == MVT::v32i8 && "Unexpected operand type"); + // Divide the vector into 2 parts and store each part separately + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, + DAG.getIntPtrConstant(0, dl)); + Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo); + SDValue BasePtr = St->getBasePtr(); + SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr, + St->getMemOperand()); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, + DAG.getIntPtrConstant(16, dl)); + Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); + + SDValue BasePtrHi = + 
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(2, dl, BasePtr.getValueType())); + + SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, + BasePtrHi, St->getMemOperand()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); +} + +static SDValue LowerExtended1BitVectorLoad(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 && + "Expected i1 vector load"); + unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ? + ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + MVT VT = Op.getValueType().getSimpleVT(); + unsigned NumElts = VT.getVectorNumElements(); + + if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + NumElts == 16) { + // Load and extend - everything is legal + if (NumElts < 8) { + SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(), + Ld->getBasePtr(), + Ld->getMemOperand()); + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); + SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(), + Ld->getBasePtr(), + Ld->getMemOperand()); + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load); + } + + if (NumElts <= 8) { + // A subset, assume that we have only AVX-512F + unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; + MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); + SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), + Ld->getBasePtr(), + Ld->getMemOperand()); + // Replace chain users with the new chain. 
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad); + SDValue BitVec = DAG.getBitcast(MaskVT, Load); + + if (NumElts == 8) + return DAG.getNode(ExtOpcode, dl, VT, BitVec); + + // we should take care to v4i1 and v2i1 + + MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); + SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + assert(VT == MVT::v32i8 && "Unexpected extload type"); + + SmallVector<SDValue, 2> Chains; + + SDValue BasePtr = Ld->getBasePtr(); + SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), + Ld->getBasePtr(), + Ld->getMemOperand()); + Chains.push_back(LoadLo.getValue(1)); + + SDValue BasePtrHi = + DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(2, dl, BasePtr.getValueType())); + + SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), + BasePtrHi, + Ld->getMemOperand()); + Chains.push_back(LoadHi.getValue(1)); + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + + SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); + SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi); +} + // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise @@ -15465,7 +16345,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. -static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector sext loads."); @@ -15473,11 +16353,14 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, "We only custom lower integer vector sext loads."); // Nothing useful we can do without SSE2 shuffles. - assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); + if (MemVT.getScalarType() == MVT::i1) + return LowerExtended1BitVectorLoad(Op, Subtarget, DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); @@ -15492,7 +16375,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. 
In that case, we can always emit a sextload to @@ -15508,8 +16391,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); @@ -15522,9 +16405,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); } // Replace chain users with the new chain. @@ -15592,8 +16474,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + Ld->getAlignment(), Ld->getMemOperand()->getFlags()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. @@ -15615,7 +16496,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; @@ -15637,7 +16518,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), ShuffleVec); // Bitcast to the requested type. Shuff = DAG.getBitcast(RegVT, Shuff); @@ -15645,9 +16526,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, return Shuff; } -// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or -// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart -// from the AND / OR. +/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes +/// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) @@ -15658,8 +16538,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Op.getOperand(1).hasOneUse()); } -// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and -// 1 and that the SETCC node has a single use. +/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the +/// SETCC node has a single use. 
static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; @@ -15692,8 +16572,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Inverted = true; Cond = Cond.getOperand(0); } else { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) + if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } @@ -15917,8 +16796,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look pass the truncate if the high bits are known zero. - if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -15951,7 +16829,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || + bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack; SDLoc dl(Op); @@ -15966,7 +16844,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); - bool Is64Bit = Subtarget->is64Bit(); + bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; @@ -15975,13 +16853,10 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); - EVT VT = Node->getValueType(0); - SDValue Tmp3 = Node->getOperand(2); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); - unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -15995,12 +16870,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function *F = MF.getFunction(); - - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (I->hasNestAttr()) + for (const auto &A : F->args()) { + if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); + } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); @@ -16009,16 +16883,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { - SDValue Flag; - const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); - - Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); - Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); + MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); - Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -16047,13 +16916,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || - Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { + if (!Subtarget.is64Bit() || + Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } // __va_list_tag: @@ -16064,45 +16933,45 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset - SDValue Store = DAG.getStore(Op.getOperand(0), DL, - DAG.getConstant(FuncInfo->getVarArgsGPOffset(), - DL, MVT::i32), - FIN, MachinePointerInfo(SV), false, false, 0); + SDValue Store = DAG.getStore( + Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, + MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); - Store = DAG.getStore(Op.getOperand(0), DL, - DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, - MVT::i32), - FIN, MachinePointerInfo(SV, 4), false, false, 0); + FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); + Store = DAG.getStore( + Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, + MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, - MachinePointerInfo(SV, 8), - false, false, 0); + Store = + DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( - Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); + Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( - SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); + Store = DAG.getStore( + Op.getOperand(0), DL, RSFIN, FIN, + MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 
16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->is64Bit() && + assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNode()->getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); @@ -16132,9 +17001,9 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!Subtarget->useSoftFloat() && + assert(!Subtarget.useSoftFloat() && !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && - Subtarget->hasSSE1()); + Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG @@ -16153,19 +17022,15 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain = VAARG.getValue(1); // Load the next argument and return it - return DAG.getLoad(ArgVT, dl, - Chain, - VAARG, - MachinePointerInfo(), - false, false, false, 0); + return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } -static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. - assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); - if (Subtarget->isCallingConvWin64( + assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction()->getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); @@ -16183,9 +17048,9 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } -// getTargetVShiftByConstNode - Handle vector element shifts where the shift -// amount is a constant. Takes immediate version of shift as input. -static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, +/// Handle vector element shifts where the shift amount is a constant. +/// Takes immediate version of shift as input. 
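For orientation, the stores emitted by LowerVASTART above fill the standard SysV x86-64 va_list record at offsets 0, 4, 8 and 16 (12 on ILP32). A minimal C++ sketch of that layout, purely for illustration; the field names follow the psABI and are not part of this patch:

// 24-byte __va_list_tag on LP64 targets, matching the four stores above.
struct VAListTag {
  unsigned GPOffset;      // +0:  next general-purpose register slot to use
  unsigned FPOffset;      // +4:  next XMM register slot to use
  void *OverflowArgArea;  // +8:  stack area for arguments that did not fit in registers
  void *RegSaveArea;      // +16: frame slot where the register arguments were spilled
};
static_assert(sizeof(void *) != 8 || sizeof(VAListTag) == 24,
              "LP64 layout matches the offsets stored above");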
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); @@ -16214,11 +17079,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, ConstantSDNode *ND; switch(Opc) { - default: llvm_unreachable(nullptr); + default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->getOpcode() == ISD::UNDEF) { + if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } @@ -16230,7 +17095,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->getOpcode() == ISD::UNDEF) { + if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } @@ -16242,7 +17107,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->getOpcode() == ISD::UNDEF) { + if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } @@ -16253,16 +17118,16 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, break; } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + return DAG.getBuildVector(VT, dl, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } -// getTargetVShiftNode - Handle vector element shifts where the shift amount -// may or may not be a constant. Takes immediate version of shift as input. -static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, +/// Handle vector element shifts where the shift amount may or may not be a +/// constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); @@ -16288,7 +17153,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, // Let the shuffle legalizer expand this shift amount node. SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); } else { // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. @@ -16301,7 +17166,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, ShOps.push_back(DAG.getUNDEF(SVT)); MVT BVT = SVT == MVT::i32 ? 
MVT::v4i32 : MVT::v2i64; - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + ShAmt = DAG.getBuildVector(BVT, dl, ShOps); } // The return type has to be a 128-bit type with the same element @@ -16316,8 +17181,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, /// \brief Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, - const X86Subtarget *Subtarget, - SelectionDAG &DAG, SDLoc dl) { + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl) { if (isAllOnesConstant(Mask)) return DAG.getTargetConstant(1, dl, MaskVT); @@ -16330,9 +17195,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); } - if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { if (MaskVT == MVT::v64i1) { - assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, @@ -16368,7 +17233,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); @@ -16393,13 +17258,14 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: + case ISD::FP_TO_FP16: // We can't use ISD::VSELECT here because it is not always "Legal" // for the destination type. For example vpmovqb require only AVX512 // and vselect that can operate on byte element type require BWI OpcodeSelect = X86ISD::SELECT; break; } - if (PreservedSrc.getOpcode() == ISD::UNDEF) + if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } @@ -16413,7 +17279,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, /// for a scalar instruction. 
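Conceptually, the masking helpers here (getVectorMaskingNode and getScalarMaskingNode) reduce to a per-lane select between the freshly computed result and the preserved source, with the zero-masking variants passing a zero vector as the preserved value. A small standalone model of that behaviour, for illustration only:

#include <array>
#include <cstddef>
#include <cstdint>

// Lane i of the result is the new value when mask bit i is set, otherwise the
// pass-through value (all zeroes for the *_maskz intrinsics).
template <typename T, std::size_t N>
std::array<T, N> applyMask(std::uint64_t Mask, const std::array<T, N> &Op,
                           const std::array<T, N> &PassThru) {
  std::array<T, N> Result{};
  for (std::size_t I = 0; I != N; ++I)
    Result[I] = ((Mask >> I) & 1) ? Op[I] : PassThru[I];
  return Result;
}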
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (isAllOnesConstant(Mask)) return Op; @@ -16429,7 +17295,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); - if (PreservedSrc.getOpcode() == ISD::UNDEF) + if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } @@ -16495,7 +17361,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -16706,6 +17572,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case VPERM_2OP_MASK : { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1), + Mask, PassThru, Subtarget, DAG); + } case VPERM_3OP_MASKZ: case VPERM_3OP_MASK:{ // Src2 is the PassThru @@ -16764,6 +17640,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case FMA_OP_SCALAR_MASK: + case FMA_OP_SCALAR_MASK3: + case FMA_OP_SCALAR_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == FMA_OP_SCALAR_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else if (IntrData->Type == FMA_OP_SCALAR_MASK3) + PassThru = Src3; + else + PassThru = Src1; + + SDValue Rnd = Op.getOperand(5); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, + Op.getValueType(), Src1, Src2, + Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } case TERLOG_OP_MASK: case TERLOG_OP_MASKZ: { SDValue Src1 = Op.getOperand(1); @@ -16879,49 +17779,76 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, - DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), - DAG.getValueType(MVT::i1)); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); - assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); - SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, dl, MVT::i8), Cond); + SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); + SDValue SetCC; + switch (CC) { + case ISD::SETEQ: { // (ZF = 0 and PF = 0) + 
SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi); + SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NP, dl, MVT::i8), + Comi); + SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); + break; + } + case ISD::SETNE: { // (ZF = 1 or PF = 1) + SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi); + SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_P, dl, MVT::i8), + Comi); + SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); + break; + } + case ISD::SETGT: // (CF = 0 and ZF = 0) + SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi); + break; + case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. + SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi); + break; + } + case ISD::SETGE: // CF = 0 + SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi); + break; + case ISD::SETLE: // The condition is opposite to GE. Swap the operands. + SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi); + break; + default: + llvm_unreachable("Unexpected illegal condition!"); + } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue CC = Op.getOperand(3); + unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); SDValue Sae = Op.getOperand(4); - auto ComiType = TranslateX86ConstCondToX86CC(CC); - // choose between ordered and unordered (comi/ucomi) - unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; - SDValue Cond; - if (cast<ConstantSDNode>(Sae)->getZExtValue() != - X86::STATIC_ROUNDING::CUR_DIRECTION) - Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + + SDValue FCmp; + if (cast<ConstantSDNode>(Sae)->getZExtValue() == + X86::STATIC_ROUNDING::CUR_DIRECTION) + FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" + return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); - case VSHIFT_MASK: - return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, - Op.getSimpleValueType(), - Op.getOperand(1), - Op.getOperand(2), DAG), - Op.getOperand(4), Op.getOperand(3), Subtarget, - DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -16940,14 +17867,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } - case BLEND: { - SDValue Mask = Op.getOperand(3); - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), - Op.getOperand(2)); - } case KUNPCK: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); @@ -16960,6 +17879,35 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src2, Src1); return DAG.getBitcast(VT, Res); } + case FIXUPIMMS: + case FIXUPIMMS_MASKZ: + case FIXUPIMM: + case FIXUPIMM_MASKZ:{ + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Imm = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? + Src1 : getZeroVector(VT, Subtarget, DAG, dl); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". 
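The COMI lowering above turns each IR predicate into one or two EFLAGS tests after a (u)comis compare: unordered operands set PF, so equality also has to check PF, and LT/LE are obtained by swapping the operands of GT/GE. The same mapping restated as a standalone helper with hypothetical names, shown only to make the flag logic explicit:

#include <cassert>

enum class FPPred { EQ, NE, GT, GE, LT, LE };
struct ComiLowering {
  const char *FlagsTest; // EFLAGS condition checked after comis/ucomis
  bool SwapOperands;     // compare RHS,LHS instead of LHS,RHS
};

ComiLowering lowerComiPredicate(FPPred P) {
  switch (P) {
  case FPPred::EQ: return {"ZF==1 && PF==0", false}; // sete  && setnp
  case FPPred::NE: return {"ZF==0 || PF==1", false}; // setne || setp
  case FPPred::GT: return {"CF==0 && ZF==0", false}; // seta
  case FPPred::GE: return {"CF==0",          false}; // setae
  case FPPred::LT: return {"CF==0 && ZF==0", true};  // seta  on swapped operands
  case FPPred::LE: return {"CF==0",          true};  // setae on swapped operands
  }
  assert(false && "unknown predicate");
  return {"", false};
}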
+ SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Imm, Rnd), + Mask, Passthru, Subtarget, DAG); + else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Imm, Rnd), + Mask, Passthru, Subtarget, DAG); + } case CONVERT_TO_MASK: { MVT SrcVT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); @@ -16995,6 +17943,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget subVec, subVec, immVal), Mask, Passthru, Subtarget, DAG); } + case BRCST32x2_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + + assert((VT.getScalarType() == MVT::i32 || + VT.getScalarType() == MVT::f32) && "Unexpected type!"); + //bitcast Src to packed 64 + MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64; + MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64); + Src = DAG.getBitcast(BitcastVT, Src); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } default: break; } @@ -17082,7 +18045,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -17163,6 +18126,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(Opcode, dl, VTs, NewOps); } + case Intrinsic::eh_sjlj_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + auto &Context = MF.getMMI().getContext(); + MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + + Twine(MF.getFunctionNumber())); + return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT)); + } + case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); @@ -17192,7 +18165,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. 
MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); @@ -17206,7 +18179,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget * Subtarget) { + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17217,7 +18190,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - if (Src.getOpcode() == ISD::UNDEF) + if (Src.isUndef()) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -17237,7 +18210,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -17255,18 +18228,19 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } -// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that -// read performance monitor counters (x86_rdpmc). -static void getReadPerformanceCounter(SDNode *N, SDLoc DL, - SelectionDAG &DAG, const X86Subtarget *Subtarget, - SmallVectorImpl<SDValue> &Results) { +/// Handles the lowering of builtin intrinsics that read performance monitor +/// counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SmallVectorImpl<SDValue> &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; @@ -17279,7 +18253,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL, // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. 
- if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); @@ -17290,7 +18264,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL, } Chain = HI.getValue(1); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, @@ -17307,12 +18281,13 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL, Results.push_back(Chain); } -// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that -// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is -// also used to custom lower READCYCLECOUNTER nodes. -static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, - SelectionDAG &DAG, const X86Subtarget *Subtarget, - SmallVectorImpl<SDValue> &Results) { +/// Handles the lowering of builtin intrinsics that read the time stamp counter +/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower +/// READCYCLECOUNTER nodes. +static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SmallVectorImpl<SDValue> &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; @@ -17320,7 +18295,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); @@ -17341,10 +18316,10 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, // Explicitly store the content of ECX at the location passed in input // to the 'rdtscp' intrinsic. Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), - MachinePointerInfo(), false, false, 0); + MachinePointerInfo()); } - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. 
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, @@ -17361,7 +18336,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, Results.push_back(Chain); } -static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<SDValue, 2> Results; SDLoc DL(Op); @@ -17388,44 +18363,25 @@ static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { return Chain; } -/// \brief Lower intrinsics for TRUNCATE_TO_MEM case -/// return truncate Store/MaskedStore Node -static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, - SelectionDAG &DAG, - MVT ElementType) { - SDLoc dl(Op); - SDValue Mask = Op.getOperand(4); - SDValue DataToTruncate = Op.getOperand(3); - SDValue Addr = Op.getOperand(2); +static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); + SDValue EHGuard = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EHGuard only live in functions using WinEH"); - MVT VT = DataToTruncate.getSimpleValueType(); - MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - - if (isAllOnesConstant(Mask)) // return just a truncate store - return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, - MachinePointerInfo(), SVT, false, false, - SVT.getScalarSizeInBits()/8); - - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(MachinePointerInfo(), - MachineMemOperand::MOStore, SVT.getStoreSize(), - SVT.getScalarSizeInBits()/8); + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); + EHInfo->EHGuardFrameIndex = FINode->getIndex(); - return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, - VMask, SVT, MMO, true); + // Return the chain operand without making any DAG nodes. 
+ return Chain; } -static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -17433,6 +18389,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (!IntrData) { if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_ehguard) + return MarkEHGuard(Op, DAG); if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || IntNo == llvm::Intrinsic::x86_flags_read_u64 || IntNo == llvm::Intrinsic::x86_flags_write_u32 || @@ -17491,7 +18449,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, - Scale, Chain, *Subtarget); + Scale, Chain, Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -17504,7 +18462,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, - *Subtarget); + Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { @@ -17532,7 +18490,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, } // ADC/ADCX/SBB case ADX: { - SmallVector<SDValue, 2> Results; SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), @@ -17540,13 +18497,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), - Op.getOperand(5), MachinePointerInfo(), - false, false, 0); + Op.getOperand(5), MachinePointerInfo()); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_B, dl, MVT::i8), Res.getValue(1)); - Results.push_back(SetCC); - Results.push_back(Store); + SDValue Results[] = { SetCC, Store }; return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { @@ -17554,48 +18509,45 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - MVT VT = DataToCompress.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, - MachinePointerInfo(), false, false, - VT.getScalarSizeInBits()/8); + MemIntr->getMemOperand()); SDValue Compressed = getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, DAG.getUNDEF(VT), Subtarget, DAG); return DAG.getStore(Chain, dl, Compressed, Addr, - MachinePointerInfo(), false, false, - VT.getScalarSizeInBits()/8); + MemIntr->getMemOperand()); } case TRUNCATE_TO_MEM_VI8: - return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); case TRUNCATE_TO_MEM_VI16: - return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); - case TRUNCATE_TO_MEM_VI32: - return 
LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); - case EXPAND_FROM_MEM: { + case TRUNCATE_TO_MEM_VI32: { SDValue Mask = Op.getOperand(4); - SDValue PassThru = Op.getOperand(3); + SDValue DataToTruncate = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - MVT VT = Op.getSimpleValueType(); - if (isAllOnesConstant(Mask)) // return just a load - return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits()/8); + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); - SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), - false, false, false, - VT.getScalarSizeInBits()/8); + EVT VT = MemIntr->getMemoryVT(); - SDValue Results[] = { - getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), - Mask, PassThru, Subtarget, DAG), Chain}; - return DAG.getMergeValues(Results, dl); + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT, + MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT, + MemIntr->getMemOperand(), true); } - case LOADU: - case LOADA: { + case EXPAND_FROM_MEM: { SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); @@ -17605,13 +18557,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, + MemIntr->getMemOperand()); + if (isAllOnesConstant(Mask)) // return just a load - return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + return DataToExpand; - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, - MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + SDValue Results[] = { + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), + Mask, PassThru, Subtarget, DAG), Chain}; + return DAG.getMergeValues(Results, dl); } } } @@ -17630,25 +18585,24 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, PtrVT, - FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), + MachinePointerInfo()); } // Just load the return address. 
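LowerRETURNADDR and LowerFRAMEADDR above walk the frame-pointer chain: each frame record holds the caller's frame pointer at offset 0 and the return address one slot above it. A rough standalone model of that walk, assuming frame pointers are kept; illustrative only:

#include <cstdint>

// Follow Depth saved frame pointers, then read the return address stored
// SlotSize bytes above the final frame record (RegInfo->getSlotSize() above).
void *returnAddressAt(void *FrameAddr, unsigned Depth, unsigned SlotSize) {
  auto *FP = static_cast<char *>(FrameAddr);
  while (Depth--)
    FP = *reinterpret_cast<char **>(FP);            // previous frame pointer
  return *reinterpret_cast<void **>(FP + SlotSize); // saved return address
}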
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, false, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo()); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); EVT VT = Op.getValueType(); MFI->setFrameAddressIsTaken(true); @@ -17678,8 +18632,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), - false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -17687,7 +18640,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { - const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const MachineFunction &MF = DAG.getMachineFunction(); unsigned Reg = StringSwitch<unsigned>(RegName) @@ -17703,7 +18656,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, " is allocatable: function has no frame pointer"); #ifndef NDEBUG else { - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && @@ -17720,23 +18673,27 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } unsigned X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) - return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; - return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; + return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; } unsigned X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); - return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + return Subtarget.isTarget64BitLP64() ? 
X86::RDX : X86::EDX; +} + +bool X86TargetLowering::needsFixedCatchObjects() const { + return Subtarget.isTargetWin64(); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { @@ -17746,7 +18703,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -17758,8 +18715,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); - Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), - false, false, 0); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, @@ -17769,6 +18725,16 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + // If the subtarget is not 64bit, we may need the global base reg + // after isel expand pseudo, i.e., after CGBR pass ran. + // Therefore, ask for the GlobalBaseReg now, so that the pass + // inserts the code for us in case we need it. + // Otherwise, we will end up in a situation where we will + // reference a virtual register that is not defined! + if (!Subtarget.is64Bit()) { + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); + } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); @@ -17781,6 +18747,13 @@ SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, Op.getOperand(0), Op.getOperand(1)); } +SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, + Op.getOperand(0)); +} + static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } @@ -17794,9 +18767,9 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { SDValue OutChains[6]; // Large code-model. @@ -17812,14 +18785,13 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), - Addr, MachinePointerInfo(TrmpAddr), - false, false, 0); + Addr, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); - OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, - MachinePointerInfo(TrmpAddr, 2), - false, false, 2); + OutChains[1] = + DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), + /* Alignment = */ 2); // Load the 'nest' parameter value into R10. 
// R10 is specified in X86CallingConv.td @@ -17827,29 +18799,26 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), - Addr, MachinePointerInfo(TrmpAddr, 10), - false, false, 0); + Addr, MachinePointerInfo(TrmpAddr, 10)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); - OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, - MachinePointerInfo(TrmpAddr, 12), - false, false, 2); + OutChains[3] = + DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), + /* Alignment = */ 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), - Addr, MachinePointerInfo(TrmpAddr, 20), - false, false, 0); + Addr, MachinePointerInfo(TrmpAddr, 20)); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), - Addr, MachinePointerInfo(TrmpAddr, 22), - false, false, 0); + Addr, MachinePointerInfo(TrmpAddr, 22)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { @@ -17909,29 +18878,28 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; - OutChains[0] = DAG.getStore(Root, dl, - DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), - Trmp, MachinePointerInfo(TrmpAddr), - false, false, 0); + OutChains[0] = + DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), + Trmp, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); - OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, - MachinePointerInfo(TrmpAddr, 1), - false, false, 1); + OutChains[1] = + DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), + /* Alignment = */ 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
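Reading the constants in the 64-bit LowerINIT_TRAMPOLINE path above, the six stores emit a 23-byte stub at offsets 0, 2, 10, 12, 20 and 22. A sketch of that layout with the byte values derived from those opcode constants, shown for illustration rather than as authoritative encoding documentation:

#include <cstdint>
#include <cstring>

// Stub layout:
//   +0   49 BB <fptr:8>   movabsq $fptr, %r11
//   +10  49 BA <nest:8>   movabsq $nest, %r10
//   +20  49 FF E3         jmpq   *%r11
void writeTrampoline(std::uint8_t *Trmp, std::uint64_t FPtr, std::uint64_t Nest) {
  const std::uint8_t MovR11[] = {0x49, 0xBB};
  const std::uint8_t MovR10[] = {0x49, 0xBA};
  const std::uint8_t JmpR11[] = {0x49, 0xFF, 0xE3};
  std::memcpy(Trmp + 0, MovR11, 2);
  std::memcpy(Trmp + 2, &FPtr, 8);   // little-endian immediate
  std::memcpy(Trmp + 10, MovR10, 2);
  std::memcpy(Trmp + 12, &Nest, 8);
  std::memcpy(Trmp + 20, JmpR11, 3);
}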
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), - false, false, 1); + /* Alignment = */ 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); - OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, - MachinePointerInfo(TrmpAddr, 6), - false, false, 1); + OutChains[3] = + DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), + /* Alignment = */ 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } @@ -17959,7 +18927,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, */ MachineFunction &MF = DAG.getMachineFunction(); - const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -17979,8 +18947,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, Ops, MVT::i16, MMO); // Load FP Control Word from stack slot - SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, - MachinePointerInfo(), false, false, false, 0); + SDValue CWD = + DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = @@ -18014,6 +18982,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // split the vector, perform operation on it's Lo a Hi part and // concatenate the results. static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::CTLZ); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); @@ -18044,8 +19013,8 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); - Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); - Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo); + Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } @@ -18064,51 +19033,112 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); } -static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +// Lower CTLZ using a PSHUFB lookup table implementation. +static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - MVT OpVT = VT; - unsigned NumBits = VT.getSizeInBits(); - SDLoc dl(Op); + int NumElts = VT.getVectorNumElements(); + int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); + MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); - if (VT.isVector() && Subtarget->hasAVX512()) - return LowerVectorCTLZ_AVX512(Op, DAG); + // Per-nibble leading zero PSHUFB lookup table. + const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, + /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, + /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; - Op = Op.getOperand(0); - if (VT == MVT::i8) { - // Zero extend to i32 since there is not an i8 bsr. 
- OpVT = MVT::i32; - Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + SmallVector<SDValue, 64> LUTVec; + for (int i = 0; i < NumBytes; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec); + + // Begin by bitcasting the input to byte vector, then split those bytes + // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them. + // If the hi input nibble is zero then we add both results together, otherwise + // we just take the hi result (by masking the lo result to zero before the + // add). + SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); + SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); + + SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); + SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); + SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); + SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); + SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); + + Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); + Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); + Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); + SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); + + // Merge result back from vXi8 back to VT, working on the lo/hi halves + // of the current vector width in the same way we did for the nibbles. + // If the upper half of the input element is zero then add the halves' + // leading zero counts together, otherwise just use the upper half's. + // Double the width of the result until we are at target width. + while (CurrVT != VT) { + int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); + int CurrNumElts = CurrVT.getVectorNumElements(); + MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); + MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); + SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); + + // Check if the upper half of the input element is zero. + SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), + DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); + HiZ = DAG.getBitcast(NextVT, HiZ); + + // Move the upper/lower halves to the lower bits as we'll be extending to + // NextVT. Mask the lower result to zero if HiZ is true and add the results + // together. + SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); + SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); + SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); + R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); + Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); + CurrVT = NextVT; } - // Issue a bsr (scan bits in reverse) which also sets EFLAGS. - SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); - Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + return Res; +} - // If src is zero (i.e. bsr sets ZF), returns NumBits. - SDValue Ops[] = { - Op, - DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); +static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDValue Op0 = Op.getOperand(0); - // Finally xor with NumBits-1. 
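The PSHUFB-based LowerVectorCTLZInRegLUT above counts leading zeros per nibble and only lets the low-nibble result contribute when the high nibble is zero (the HiZ mask). The same idea for a single byte, written out as a scalar helper; an illustration, not DAG code:

#include <cstdint>

// LUT[n] is the number of leading zeros inside a 4-bit value; the low nibble
// only matters when the high nibble is zero.
unsigned countLeadingZeros8(std::uint8_t V) {
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = V >> 4, Lo = V & 0xF;
  return Hi ? LUT[Hi] : 4 + LUT[Lo];
}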
- Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, - DAG.getConstant(NumBits - 1, dl, OpVT)); + if (Subtarget.hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); - if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); - return Op; + // Decompose 256-bit ops into smaller 128-bit ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 128-bit vector, perform ctlz and concat the result. + SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); + SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS), + DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS)); + } + + assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); + return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - EVT OpVT = VT; + MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + unsigned Opc = Op.getOpcode(); + + if (VT.isVector()) + return LowerVectorCTLZ(Op, dl, Subtarget, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { @@ -18117,11 +19147,22 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } - // Issue a bsr (scan bits in reverse). + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); - // And xor with NumBits-1. + if (Opc == ISD::CTLZ) { + // If src is zero (i.e. bsr sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); + } + + // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); @@ -18136,8 +19177,6 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); if (VT.isVector()) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue N0 = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, VT); @@ -18146,8 +19185,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); // cttz_undef(x) = (width - 1) - ctlz(lsb) - if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && - TLI.isOperationLegal(ISD::CTLZ, VT)) { + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, DAG.getNode(ISD::CTLZ, dl, VT, LSB)); @@ -18176,8 +19214,8 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } -// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit -// ones, and then concatenate the result back. +/// Break a 256-bit integer operation into two new 128-bit ones and then +/// concatenate the result back. 
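The merged scalar LowerCTLZ path above relies on BSR returning the index of the highest set bit, so CTLZ falls out as (NumBits - 1) XOR that index, with a CMOV patching the zero-input case for plain ISD::CTLZ. A standalone sketch using GCC/Clang inline assembly, for illustration:

#include <cstdint>

unsigned ctlz32(std::uint32_t V) {
  if (V == 0)
    return 32;              // the CMOV-on-ZF case
  std::uint32_t Idx;
  __asm__("bsrl %1, %0" : "=r"(Idx) : "r"(V)); // index of the highest set bit
  return 31 ^ Idx;          // same as 31 - Idx, matching the final XOR above
}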
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -18189,13 +19227,42 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); +} + +/// Break a 512-bit integer operation into two new 256-bit ones and then +/// concatenate the result back. +static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + assert(VT.is512BitVector() && VT.isInteger() && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + SDLoc dl(Op); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -18232,7 +19299,7 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } -static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -18241,28 +19308,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.is256BitVector() && !Subtarget->hasInt256()) + if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector - // pairs, multiply and truncate. 
- if (VT == MVT::v16i8 || VT == MVT::v32i8) { - if (Subtarget->hasInt256()) { - if (VT == MVT::v32i8) { - MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); - SDValue Lo = DAG.getIntPtrConstant(0, dl); - SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); - SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); - SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); - SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); - SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), - DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); - } + // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 + // vector pairs, multiply and truncate. + if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { + if (Subtarget.hasInt256()) { + // For 512-bit vectors, split into 256-bit vectors to allow the + // sign-extension to occur. + if (VT == MVT::v64i8) + return Lower512IntArith(Op, DAG); + + // For 256-bit vectors, split into 128-bit vectors to allow the + // sign-extension to occur. We don't need this on AVX512BW as we can + // safely sign-extend to v32i16. + if (VT == MVT::v32i8 && !Subtarget.hasBWI()) + return Lower256IntArith(Op, DAG); MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( @@ -18278,7 +19343,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); } else { @@ -18294,7 +19359,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Extract the hi parts and sign extend to i16 SDValue AHi, BHi; - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); @@ -18322,7 +19387,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { - assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && + assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmuldq is available!"); // Extract the odd parts. @@ -18386,8 +19451,122 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } +static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // Decompose 256-bit ops into smaller 128-bit ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntArith(Op, DAG); + + // Only i8 vectors should need custom lowering after this. + assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) && + "Unsupported vector type"); + + // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, + // logical shift down the upper half and pack back to i8. + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack + // and then ashr/lshr the upper bits down to the lower bits before multiply. + unsigned Opcode = Op.getOpcode(); + unsigned ExShift = (ISD::MULHU == Opcode ? 
ISD::SRL : ISD::SRA); + unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); + + // AVX2 implementations - extend xmm subvectors to ymm. + if (Subtarget.hasInt256()) { + SDValue Lo = DAG.getIntPtrConstant(0, dl); + SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); + + if (VT == MVT::v32i8) { + SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo); + SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo); + SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi); + SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi); + ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo); + BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo); + AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi); + BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi); + Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, + DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo), + DAG.getConstant(8, dl, MVT::v16i16)); + Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16, + DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi), + DAG.getConstant(8, dl, MVT::v16i16)); + // The ymm variant of PACKUS treats the 128-bit lanes separately, so before + // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane. + const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23}; + const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31}; + return DAG.getNode(X86ISD::PACKUS, dl, VT, + DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask), + DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); + } + + SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); + SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); + SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); + SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, + DAG.getConstant(8, dl, MVT::v16i16)); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi); + return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + } + + assert(VT == MVT::v16i8 && + "Pre-AVX2 support only supports v16i8 multiplication"); + MVT ExVT = MVT::v8i16; + + // Extract the lo parts and zero/sign extend to i16. + SDValue ALo, BLo; + if (Subtarget.hasSSE41()) { + ALo = DAG.getNode(ExSSE41, dl, ExVT, A); + BLo = DAG.getNode(ExSSE41, dl, ExVT, B); + } else { + const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, + -1, 4, -1, 5, -1, 6, -1, 7}; + ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + ALo = DAG.getBitcast(ExVT, ALo); + BLo = DAG.getBitcast(ExVT, BLo); + ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); + BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); + } + + // Extract the hi parts and zero/sign extend to i16. 
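Both halves extracted here go through the same recipe the function comment describes: zero- or sign-extend each byte to 16 bits, multiply, logically shift the products right by 8, and pack back to bytes. A scalar sketch of that per-lane identity (hypothetical helper names, not LLVM code):

#include <cassert>
#include <cstdint>

// MULHU-style high half of an 8-bit unsigned multiply.
uint8_t mulhu8(uint8_t a, uint8_t b) {
  uint16_t wide = uint16_t(a) * uint16_t(b); // zero-extend, multiply as i16
  return uint8_t(wide >> 8);                 // keep the high byte
}

// MULHS-style high half of an 8-bit signed multiply.
int8_t mulhs8(int8_t a, int8_t b) {
  int16_t wide = int16_t(a) * int16_t(b);    // sign-extend, multiply as i16
  return int8_t(wide >> 8);                  // arithmetic shift keeps the high byte
}

int main() {
  for (int a = -128; a < 128; ++a)
    for (int b = -128; b < 128; ++b) {
      uint8_t ua = uint8_t(a), ub = uint8_t(b);
      assert(mulhu8(ua, ub) == uint8_t((unsigned(ua) * unsigned(ub)) >> 8));
      assert(mulhs8(int8_t(a), int8_t(b)) == int8_t((a * b) >> 8));
    }
  return 0;
}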
+ SDValue AHi, BHi; + if (Subtarget.hasSSE41()) { + const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); + BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + } else { + const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, + -1, 12, -1, 13, -1, 14, -1, 15}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getBitcast(ExVT, AHi); + BHi = DAG.getBitcast(ExVT, BHi); + AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); + BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); + } + + // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and + // pack back to v16i8. + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT)); + RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); +} + SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetWin64() && "Unexpected target"); + assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); @@ -18415,8 +19594,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; - InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), - false, false, 16); + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, + MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.isSExt = false; @@ -18431,21 +19610,39 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, std::move(Args), 0) + Callee, std::move(Args)) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } -static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); - assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || - (VT == MVT::v8i32 && Subtarget->hasInt256())); + // Decompose 256-bit ops into smaller 128-bit ops. 
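The LowerMUL_LOHI code that follows splits 256-bit inputs and then multiplies even/odd lanes with PMULxD; when a signed widening multiply is requested without PMULDQ, the unsigned high half is patched with the usual two's-complement correction (subtract each operand wherever the other is negative), which is what the sign-bit/AND/SUB sequence further down computes. A scalar sketch of that identity (helper name invented for illustration):

#include <cassert>
#include <cstdint>

// High 32 bits of the signed 64-bit product, recovered from the unsigned
// product:  hi_s = hi_u - (a < 0 ? b : 0) - (b < 0 ? a : 0)   (mod 2^32)
uint32_t smulHiFromUmulHi(int32_t a, int32_t b) {
  uint64_t ua = uint32_t(a), ub = uint32_t(b);
  uint32_t hi_u = uint32_t((ua * ub) >> 32);   // high half of the unsigned product
  uint32_t fixA = (a < 0) ? uint32_t(b) : 0;   // sign(a) selects b
  uint32_t fixB = (b < 0) ? uint32_t(a) : 0;   // sign(b) selects a
  return hi_u - fixA - fixB;
}

int main() {
  const int32_t tests[] = {0, 1, -1, 7, -7, 123456789, -123456789,
                           INT32_MAX, INT32_MIN};
  for (int32_t a : tests)
    for (int32_t b : tests) {
      uint64_t full = uint64_t(int64_t(a) * int64_t(b)); // exact signed product
      assert(smulHiFromUmulHi(a, b) == uint32_t(full >> 32));
    }
  return 0;
}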
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) { + unsigned Opcode = Op.getOpcode(); + unsigned NumElems = VT.getVectorNumElements(); + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2); + SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl); + SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl); + SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl); + SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl); + SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1); + SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1); + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)), + DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1)) + }; + return DAG.getMergeValues(Ops, dl); + } + + assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || + (VT == MVT::v8i32 && Subtarget.hasInt256())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. @@ -18461,16 +19658,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; // <a|b|c|d> => <b|undef|d|undef> - SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, + makeArrayRef(&Mask[0], VT.getVectorNumElements())); // <e|f|g|h> => <f|undef|h|undef> - SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, + makeArrayRef(&Mask[0], VT.getVectorNumElements())); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = - (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; + (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> // => <2 x i64> <ae|cg> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); @@ -18494,7 +19693,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. - if (IsSigned && !Subtarget->hasSSE41()) { + if (IsSigned && !Subtarget.hasSSE41()) { SDValue ShAmt = DAG.getConstant( 31, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); @@ -18515,19 +19714,19 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget -static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, +static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; if (VT.is512BitVector() && - (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) + (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; bool LShift = VT.is128BitVector() || - (VT.is256BitVector() && Subtarget->hasInt256()); + (VT.is256BitVector() && Subtarget.hasInt256()); - bool AShift = LShift && (Subtarget->hasVLX() || + bool AShift = LShift && (Subtarget.hasVLX() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? 
AShift : LShift; } @@ -18535,24 +19734,24 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. static -bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, +bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget -static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, +static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { - if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) + if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI - if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) + if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (VT.is512BitVector() || Subtarget->hasVLX()) + if (VT.is512BitVector() || Subtarget.hasVLX()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -18561,7 +19760,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -18611,12 +19810,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. - if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) + if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP()) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || - (Subtarget->hasInt256() && VT == MVT::v32i8) || + (Subtarget.hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -18628,11 +19827,16 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + if (VT.is512BitVector()) { + assert(VT == MVT::v64i8 && "Unexpected element type!"); + SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R); + return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); + } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. - if (VT == MVT::v16i8 && Subtarget->hasXOP()) + if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { @@ -18668,8 +19872,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
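The new v64i8 branch above leans on the same identity as the existing PCMPGT path: an arithmetic shift right by 7 broadcasts an i8's sign bit into every bit, which is exactly the all-ones/all-zeros mask of a signed less-than-zero compare. A quick scalar check:

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = int8_t(v);
    // ashr(x, 7): replicate the sign bit (what ISD::SRA does on each lane).
    int8_t ashr7 = int8_t(x >> 7);
    // pcmpgt(0, x): all-ones when 0 > x, i.e. x is negative, else all-zeros.
    int8_t cmpMask = (0 > x) ? int8_t(-1) : int8_t(0);
    assert(ashr7 == cmpMask);
  }
  return 0;
}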
- if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && + (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) { // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; @@ -18726,7 +19930,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget* Subtarget) { + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -18746,7 +19950,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, // Check if this build_vector node is doing a splat. // If so, then set BaseShAmt equal to the splat value. BaseShAmt = BV->getSplatValue(); - if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + if (BaseShAmt && BaseShAmt.isUndef()) BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) @@ -18787,7 +19991,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && VT == MVT::v2i64 && + if (!Subtarget.is64Bit() && VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); @@ -18808,15 +20012,16 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, +static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); assert(VT.isVector() && "Custom lowering only for vector shifts!"); - assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); + assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; @@ -18829,7 +20034,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // XOP has 128-bit variable logical/arithmetic shifts. // +ve/-ve Amt = shift left/right. - if (Subtarget->hasXOP() && + if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) { if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { @@ -18856,7 +20061,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // i64 vector arithmetic shift can be emulated with the transform: // M = lshr(SIGN_BIT, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) - if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Op.getOpcode() == ISD::SRA) { SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); @@ -18869,10 +20074,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. 
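A scalar sketch of the transform quoted in the comment above — M = lshr(SIGN_BIT, Amt); ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) — written with unsigned arithmetic so every step corresponds to an operation SSE actually has for v2i64 (the function name is illustrative only):

#include <cassert>
#include <cstdint>

// Arithmetic shift right built from logical shift, xor and sub.
uint64_t ashrViaLshr(uint64_t r, unsigned amt) {
  uint64_t m = (UINT64_C(1) << 63) >> amt; // M = lshr(SIGN_BIT, Amt)
  return ((r >> amt) ^ m) - m;             // sub(xor(lshr(R, Amt), M), M)
}

int main() {
  const int64_t vals[] = {0, 1, -1, 42, -42, INT64_MIN, INT64_MAX,
                          INT64_C(0x123456789ABCDEF0)};
  for (int64_t v : vals)
    for (unsigned amt = 0; amt < 64; ++amt) {
      // Portable reference for a true arithmetic shift.
      int64_t expected = v < 0 ? ~((~v) >> amt) : v >> amt;
      assert(int64_t(ashrViaLshr(uint64_t(v), amt)) == expected);
    }
  return 0;
}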
- if (Op.getOpcode() == ISD::SHL && + if (ConstantAmt && Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || - (Subtarget->hasInt256() && VT == MVT::v16i16)) && - ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + (Subtarget.hasInt256() && VT == MVT::v16i16))) { SmallVector<SDValue, 8> Elts; MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); @@ -18881,7 +20085,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, for (unsigned i=0; i !=NumElems; ++i) { SDValue Op = Amt->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) { + if (Op->isUndef()) { Elts.push_back(Op); continue; } @@ -18895,7 +20099,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + SDValue BV = DAG.getBuildVector(VT, dl, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } @@ -18922,15 +20126,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing // the vector shift into four scalar shifts plus four pairs of vector // insert/extract. - if ((VT == MVT::v8i16 || VT == MVT::v4i32) && - ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) { unsigned TargetOpcode = X86ISD::MOVSS; bool CanBeSimplified; // The splat value for the first packed shift (the 'X' from the example). SDValue Amt1 = Amt->getOperand(0); // The splat value for the second packed shift (the 'Y' from the example). - SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : - Amt->getOperand(2); + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); // See if it is possible to replace this node with a sequence of // two shifts followed by a MOVSS/MOVSD @@ -18991,7 +20193,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT == MVT::v4i32) { unsigned Opc = Op.getOpcode(); SDValue Amt0, Amt1, Amt2, Amt3; - if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + if (ConstantAmt) { Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); @@ -19031,14 +20233,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. - if (Subtarget->hasSSE41()) { + if (Subtarget.hasSSE41()) { V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -19141,7 +20343,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // It's worth extending once and using the v8i32 shifts for 16-bit types, but // the extra overheads to get from v16i8 to v8i32 make the existing SSE // solution better. - if (Subtarget->hasInt256() && VT == MVT::v8i16) { + if (Subtarget.hasInt256() && VT == MVT::v8i16) { MVT ExtVT = MVT::v8i32; unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; @@ -19151,13 +20353,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } - if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { + if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); - SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); - SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); @@ -19172,10 +20374,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT == MVT::v8i16) { unsigned ShiftOpcode = Op->getOpcode(); + // If we have a constant shift amount, the non-SSE41 path is best as + // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW. + bool UseSSE41 = Subtarget.hasSSE41() && + !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); + auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. - if (Subtarget->hasSSE41()) { + if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); @@ -19192,7 +20399,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; - if (Subtarget->hasSSE41()) { + if (UseSSE41) { // On SSE41 targets we need to replicate the shift mask in both // bytes for PBLENDVB. Amt = DAG.getNode( @@ -19231,43 +20438,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } // Decompose 256-bit shifts into smaller 128-bit shifts. 
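The v16i8/v32i8 and v8i16 paths above implement variable shifts by repeatedly doubling the amount vector and using its sign bit (via PBLENDVB, PBLENDW or the psraw/pand fallback) to select between the current value and a copy shifted by 4, 2 and 1. A scalar sketch of that steering scheme for one byte lane (helper names invented here):

#include <cassert>
#include <cstdint>

// Select v1 where the sign bit of sel is set, else v0 — the role the
// sign-bit blend plays in the vector code.
static uint8_t signBitSelect(uint8_t sel, uint8_t v0, uint8_t v1) {
  return (sel & 0x80) ? v1 : v0;
}

// Variable per-byte left shift with a 3-bit amount, done as three
// conditional shifts by 4, 2 and 1 steered by successive bits of the amount.
static uint8_t shlVar8(uint8_t r, uint8_t amt) {
  uint8_t a = uint8_t(amt << 5);             // move bit 2 of amt into the sign bit
  r = signBitSelect(a, r, uint8_t(r << 4));  // shift by 4 if that bit is set
  a = uint8_t(a + a);                        // next amount bit into the sign bit
  r = signBitSelect(a, r, uint8_t(r << 2));  // shift by 2
  a = uint8_t(a + a);
  r = signBitSelect(a, r, uint8_t(r << 1));  // shift by 1
  return r;
}

int main() {
  for (int r = 0; r < 256; ++r)
    for (int amt = 0; amt < 8; ++amt)
      assert(shlVar8(uint8_t(r), uint8_t(amt)) == uint8_t(r << amt));
  return 0;
}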
- if (VT.is256BitVector()) { - unsigned NumElems = VT.getVectorNumElements(); - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - // Extract the two vectors - SDValue V1 = Extract128BitVector(R, 0, DAG, dl); - SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); - - // Recreate the shift amount vectors - SDValue Amt1, Amt2; - if (Amt.getOpcode() == ISD::BUILD_VECTOR) { - // Constant shift amount - SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); - ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); - ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); - - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); - } else { - // Variable shift amount - Amt1 = Extract128BitVector(Amt, 0, DAG, dl); - Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); - } - - // Issue new vector shifts for the smaller types - V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); - V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); - - // Concatenate the result back - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); - } + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); return SDValue(); } -static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -19275,7 +20452,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, SDValue Amt = Op.getOperand(1); assert(VT.isVector() && "Custom lowering only for vector rotates!"); - assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. 
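LowerRotate only has to hand ISD::ROTL to XOP's rotate instructions, but for reference this is the rotate semantics being lowered; a small scalar sketch, where the (32 - n) & 31 form avoids an undefined shift by 32 when n is 0 (illustrative only, not part of the lowering):

#include <cassert>
#include <cstdint>

// Rotate-left of a 32-bit lane.
uint32_t rotl32(uint32_t x, unsigned n) {
  n &= 31;
  return (x << n) | (x >> ((32 - n) & 31));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  assert(rotl32(0x12345678u, 32) == 0x12345678u); // amount taken modulo 32
  return 0;
}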
@@ -19363,6 +20540,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_O, DL, MVT::i32), SDValue(Sum.getNode(), 2)); + if (N->getValueType(1) == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); + SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + } return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } @@ -19372,10 +20554,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(Cond, DL, MVT::i32), SDValue(Sum.getNode(), 1)); + if (N->getValueType(1) == MVT::i1) { + SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, + DAG.getValueType(MVT::i1)); + SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + } return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } @@ -19387,9 +20574,9 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) - return Subtarget->hasCmpxchg16b(); + return Subtarget.hasCmpxchg16b(); else return false; } @@ -19409,7 +20596,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available @@ -19446,16 +20633,9 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } } -static bool hasMFENCE(const X86Subtarget& Subtarget) { - // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for - // no-sse2). There isn't any reason to disable it if the target processor - // supports it. - return Subtarget.hasSSE2() || Subtarget.is64Bit(); -} - LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { - unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually @@ -19483,7 +20663,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. - // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. @@ -19492,7 +20672,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // the IR level, so we must wrap it in an intrinsic. 
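lowerIdempotentRMWIntoFencedLoad above turns an idempotent seq_cst RMW (or 0, add 0, ...) into a full fence followed by an ordinary load, avoiding the LOCK'ed read-modify-write. A rough C++-level sketch of the two shapes involved (this shows the shape of the transformation, not the exact IR it produces):

#include <atomic>
#include <cassert>
#include <cstdint>

std::atomic<uint32_t> Val{42};

// The idempotent RMW form: returns the current value, writes nothing new.
uint32_t readViaIdempotentRMW() {
  return Val.fetch_or(0, std::memory_order_seq_cst);
}

// The shape the lowering produces instead: a full fence followed by a plain
// load (MFENCE + MOV on x86), with no LOCK'ed operation.
uint32_t readViaFencedLoad() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return Val.load(std::memory_order_relaxed);
}

int main() {
  assert(readViaIdempotentRMW() == 42);
  assert(readViaFencedLoad() == 42);
  return 0;
}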
return nullptr; - if (!hasMFENCE(*Subtarget)) + if (!Subtarget.hasMFence()) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare @@ -19512,7 +20692,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return Loaded; } -static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( @@ -19522,8 +20702,9 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. - if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { - if (hasMFENCE(*Subtarget)) + if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && + FenceScope == CrossThread) { + if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -19545,7 +20726,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } -static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); @@ -19557,7 +20738,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: - assert(Subtarget->is64Bit() && "Node not type legal!"); + assert(Subtarget.is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } @@ -19587,14 +20768,14 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } -static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { - assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. 
return SDValue(); @@ -19614,7 +20795,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, DAG.getIntPtrConstant(i, dl))); } else { - assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, DAG.getIntPtrConstant(0, dl))); @@ -19627,14 +20808,14 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, Elts.append(NumElts, DAG.getUNDEF(SVT)); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue BV = DAG.getBuildVector(NewVT, dl, Elts); SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0, dl)); } - assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && - Subtarget->hasMMX() && "Unexpected custom BITCAST"); + assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && + Subtarget.hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); @@ -19657,12 +20838,11 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, /// how many bytes of V are summed horizontally to produce each element of the /// result. static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, - const X86Subtarget *Subtarget, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(V); MVT ByteVecVT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - int NumElts = VT.getVectorNumElements(); assert(ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."); assert(EltVT != MVT::i8 && @@ -19713,16 +20893,15 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s // right by 8. It is important to shift as i16s as i8 vector shift isn't // directly supported. 
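For the i16 case the comment above describes the whole trick: shift each 16-bit lane left by 8, add as i8s so no carry crosses a byte boundary, then shift right by 8; each lane then holds the sum of its two byte counts. A scalar sketch for one lane, with the "add as i8s" step modelled explicitly:

#include <cassert>
#include <cstdint>

// Sum the two bytes of a 16-bit lane using only lane-wide shifts and a
// byte-wise add, mirroring the SHL-8 / add-as-i8 / SRL-8 sequence.
uint16_t sumBytePair(uint16_t lane) {
  uint16_t shl = uint16_t(lane << 8);            // i16 shift left by 8
  // "Add as i8s": add shl and lane byte by byte; carries never leave a byte.
  uint8_t lo = uint8_t((shl & 0xFF) + (lane & 0xFF));
  uint8_t hi = uint8_t((shl >> 8) + (lane >> 8));
  uint16_t sum = uint16_t(lo | (hi << 8));
  return uint16_t(sum >> 8);                     // i16 shift right by 8
}

int main() {
  // Byte values here are per-byte popcounts, so they are at most 8.
  for (unsigned a = 0; a <= 8; ++a)
    for (unsigned b = 0; b <= 8; ++b) {
      uint16_t lane = uint16_t(a | (b << 8));    // low byte a, high byte b
      assert(sumBytePair(lane) == a + b);
    }
  return 0;
}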
- SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT)); - SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters); - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter); + SDValue ShifterV = DAG.getConstant(8, DL, VT); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), DAG.getBitcast(ByteVecVT, V)); - return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter); + return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); } -static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, - const X86Subtarget *Subtarget, +static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); @@ -19750,17 +20929,14 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, int NumByteElts = VecSize / 8; MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); SDValue In = DAG.getBitcast(ByteVecVT, Op); - SmallVector<SDValue, 16> LUTVec; + SmallVector<SDValue, 64> LUTVec; for (int i = 0; i < NumByteElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); - SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); - SmallVector<SDValue, 16> Mask0F(NumByteElts, - DAG.getConstant(0x0F, DL, MVT::i8)); - SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec); + SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT); // High nibbles - SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); - SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue FourV = DAG.getConstant(4, DL, ByteVecVT); SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); // Low nibbles @@ -19781,8 +20957,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); } -static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, - const X86Subtarget *Subtarget, +static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is128BitVector() && @@ -19801,19 +20977,13 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { MVT VT = V.getSimpleValueType(); - SmallVector<SDValue, 32> Shifters( - VT.getVectorNumElements(), - DAG.getConstant(Shifter, DL, VT.getVectorElementType())); - return DAG.getNode(OpCode, DL, VT, V, - DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters)); + SDValue ShifterV = DAG.getConstant(Shifter, DL, VT); + return DAG.getNode(OpCode, DL, VT, V, ShifterV); }; auto GetMask = [&](SDValue V, APInt Mask) { MVT VT = V.getSimpleValueType(); - SmallVector<SDValue, 32> Masks( - VT.getVectorNumElements(), - DAG.getConstant(Mask, DL, VT.getVectorElementType())); - return DAG.getNode(ISD::AND, DL, VT, V, - DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks)); + SDValue MaskV = DAG.getConstant(Mask, DL, VT); + return DAG.getNode(ISD::AND, DL, VT, V, MaskV); }; // We don't want to incur the implicit masks required to SRL vNi8 vectors on @@ -19852,27 +21022,38 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, DAG); } -static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, +static SDValue 
LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - // FIXME: Need to add AVX-512 support here! - assert((VT.is256BitVector() || VT.is128BitVector()) && + assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); - if (!Subtarget->hasSSSE3()) { + if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); } - if (VT.is256BitVector() && !Subtarget->hasInt256()) { + if (VT.is256BitVector() && !Subtarget.hasInt256()) { unsigned NumElems = VT.getVectorNumElements(); // Extract each 128-bit vector, compute pop count and concat the result. - SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); - SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); + SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); + SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), + LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); + } + + if (VT.is512BitVector() && !Subtarget.hasBWI()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 256-bit vector, compute pop count and concat the result. + SDValue LHS = extract256BitVector(Op0, 0, DAG, DL); + SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), @@ -19882,26 +21063,184 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } -static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } -static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); - EVT T = Node->getValueType(0); - SDValue negOp = DAG.getNode(ISD::SUB, dl, T, - DAG.getConstant(0, dl, T), Node->getOperand(2)); - return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, - cast<AtomicSDNode>(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), negOp, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); +static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + SDLoc DL(Op); + + // For scalars, its still beneficial to transfer to/from the SIMD unit to + // perform the BITREVERSE. + if (!VT.isVector()) { + MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); + SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); + Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + + MVT SVT = VT.getVectorElementType(); + int NumElts = VT.getVectorNumElements(); + int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; + + // Decompose 256-bit ops into smaller 128-bit ops. 
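LowerVectorCTPOPInRegLUT computes per-byte population counts with two PSHUFB lookups: mask off the low nibble, shift the high nibble down, look both up in a 16-entry nibble-popcount table and add. A scalar sketch of the same table-driven popcount (the table values here are the standard 0..15 popcounts; the actual LUT is defined just above the diff context shown):

#include <cassert>
#include <cstdint>

// Popcount of each possible 4-bit value — the in-register table the PSHUFB
// lookups index into.
static const uint8_t NibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4};

uint8_t ctpop8(uint8_t v) {
  uint8_t lo = v & 0x0F;   // low nibble (the M0F mask in the vector code)
  uint8_t hi = v >> 4;     // high nibble (the SRL-by-4 in the vector code)
  return uint8_t(NibblePopcnt[lo] + NibblePopcnt[hi]);
}

int main() {
  for (int v = 0; v < 256; ++v) {
    int ref = 0;
    for (int b = 0; b < 8; ++b)
      ref += (v >> b) & 1;
    assert(ctpop8(uint8_t(v)) == ref);
  }
  return 0;
}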
+ if (VT.is256BitVector()) { + SDValue Lo = extract128BitVector(In, 0, DAG, DL); + SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); + + MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo), + DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi)); + } + + assert(VT.is128BitVector() && + "Only 128-bit vector bitreverse lowering supported."); + + // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we + // perform the BSWAP in the shuffle. + // Its best to shuffle using the second operand as this will implicitly allow + // memory folding for multiple vectors. + SmallVector<SDValue, 16> MaskElts; + for (int i = 0; i != NumElts; ++i) { + for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { + int SourceByte = 16 + (i * ScalarSizeInBytes) + j; + int PermuteByte = SourceByte | (2 << 5); + MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); + } + } + + SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); + SDValue Res = DAG.getBitcast(MVT::v16i8, In); + Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), + Res, Mask); + return DAG.getBitcast(VT, Res); +} + +static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + if (Subtarget.hasXOP()) + return LowerBITREVERSE_XOP(Op, DAG); + + assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); + + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + SDLoc DL(Op); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarType() == MVT::i8 && + "Only byte vector BITREVERSE supported"); + + // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. + if (VT.is256BitVector() && !Subtarget.hasInt256()) { + MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2); + SDValue Lo = extract128BitVector(In, 0, DAG, DL); + SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); + Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo); + Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } + + // Perform BITREVERSE using PSHUFB lookups. Each byte is split into + // two nibbles and a PSHUFB lookup to find the bitreverse of each + // 0-15 value (moved to the other nibble). 
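As the comment above says, each nibble is looked up in a table that returns its bit-reverse already moved to the opposite nibble position, and the two lookups are OR'd together. A scalar sketch using the same LoLUT/HiLUT values that appear just below:

#include <cassert>
#include <cstdint>

// Bit-reverse of the low nibble, pre-shifted into the high nibble...
static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
// ...and bit-reverse of the high nibble, placed in the low nibble.
static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                  0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};

uint8_t bitreverse8(uint8_t v) {
  uint8_t lo = v & 0x0F;   // low nibble
  uint8_t hi = v >> 4;     // high nibble
  return uint8_t(LoLUT[lo] | HiLUT[hi]);
}

int main() {
  for (int v = 0; v < 256; ++v) {
    uint8_t ref = 0;
    for (int b = 0; b < 8; ++b)
      if (v & (1 << b))
        ref |= uint8_t(1 << (7 - b));
    assert(bitreverse8(uint8_t(v)) == ref);
  }
  return 0;
}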
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); + SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); + SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); + + const int LoLUT[16] = { + /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, + /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, + /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, + /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; + const int HiLUT[16] = { + /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, + /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, + /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, + /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; + + SmallVector<SDValue, 16> LoMaskElts, HiMaskElts; + for (unsigned i = 0; i < NumElts; ++i) { + LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); + HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); + } + + SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); + SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); + Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); + Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); + return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); +} + +static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) { + unsigned NewOpc = 0; + switch (N->getOpcode()) { + case ISD::ATOMIC_LOAD_ADD: + NewOpc = X86ISD::LADD; + break; + case ISD::ATOMIC_LOAD_SUB: + NewOpc = X86ISD::LSUB; + break; + case ISD::ATOMIC_LOAD_OR: + NewOpc = X86ISD::LOR; + break; + case ISD::ATOMIC_LOAD_XOR: + NewOpc = X86ISD::LXOR; + break; + case ISD::ATOMIC_LOAD_AND: + NewOpc = X86ISD::LAND; + break; + default: + llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); + } + + MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); + return DAG.getMemIntrinsicNode( + NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), + {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, + /*MemVT=*/N->getSimpleValueType(0), MMO); +} + +/// Lower atomic_load_ops into LOCK-prefixed operations. +static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue Chain = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + unsigned Opc = N->getOpcode(); + MVT VT = N->getSimpleValueType(0); + SDLoc DL(N); + + // We can lower atomic_load_add into LXADD. However, any other atomicrmw op + // can only be lowered when the result is unused. They should have already + // been transformed into a cmpxchg loop in AtomicExpand. + if (N->hasAnyUseOfValue(0)) { + // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to + // select LXADD if LOCK_SUB can't be selected. + if (Opc == ISD::ATOMIC_LOAD_SUB) { + AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); + RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, + RHS, AN->getMemOperand(), AN->getOrdering(), + AN->getSynchScope()); + } + assert(Opc == ISD::ATOMIC_LOAD_ADD && + "Used AtomicRMW ops other than Add should have been expanded!"); + return N; + } + + SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG); + // RAUW the chain, but don't worry about the result, as it's unused. 
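When the result of the RMW is still used, only LXADD is available, so lowerAtomicArith rewrites atomic_load_sub p, v as atomic_load_add p, -v, exactly as the comment says. A sketch of that rewrite at the std::atomic level (illustrative; unsigned wrap-around makes the negation well defined):

#include <atomic>
#include <cassert>
#include <cstdint>

std::atomic<uint32_t> Counter{100};

// fetch_sub(v) and fetch_add(0 - v) return the same old value and leave the
// same new value behind — the rewrite used when only LXADD can be selected.
int main() {
  uint32_t v = 7;
  uint32_t old1 = Counter.fetch_sub(v);        // atomic_load_sub p, v
  uint32_t old2 = Counter.fetch_add(0u - v);   // atomic_load_add p, -v
  assert(old1 == 100 && old2 == 93);
  assert(Counter.load() == 86);
  return 0;
}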
+ assert(!N->hasAnyUseOfValue(0)); + DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); + return SDValue(); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { @@ -19914,7 +21253,8 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { // FIXME: On 32-bit, store -> fist or movq would be more efficient // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. - if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || + if (cast<AtomicSDNode>(Node)->getOrdering() == + AtomicOrdering::SequentiallyConsistent || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), @@ -19955,9 +21295,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or @@ -19991,7 +21331,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -20051,7 +21391,7 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); - return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + return DAG.getBuildVector(NVT, dl, Ops); } SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); @@ -20059,9 +21399,9 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, InOp, DAG.getIntPtrConstant(0, dl)); } -static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget->hasAVX512() && + assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); // X86 scatter kills mask register, so its type should be added to @@ -20110,7 +21450,7 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, } unsigned NumElts = VT.getVectorNumElements(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && + if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. 
If now the both index and data are 256-bit, but @@ -20150,68 +21490,78 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + return SDValue(NewScatter.getNode(), 1); } -static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); - if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && - !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { - // This operation is legal for targets with VLX, but without - // VLX the vector should be widened to 512 bit - unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); - MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); - MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); - SDValue Src0 = N->getSrc0(); - Src0 = ExtendToType(Src0, WideDataVT, DAG); - Mask = ExtendToType(Mask, WideMaskVT, DAG, true); - SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), - N->getBasePtr(), Mask, Src0, - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType()); - - SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - NewLoad.getValue(0), - DAG.getIntPtrConstant(0, dl)); - SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; - return DAG.getMergeValues(RetOps, dl); - } - return Op; + assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && + "Cannot lower masked load op."); + + assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) || + (Subtarget.hasBWI() && + (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && + "Unsupported masked load op."); + + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } -static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); - if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && - !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { - // This operation is legal for targets with VLX, but without - // VLX the vector should be widened to 512 bit - unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); - MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); - MVT WideMaskVT = 
MVT::getVectorVT(MVT::i1, NumEltsInWideVec); - DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); - Mask = ExtendToType(Mask, WideMaskVT, DAG, true); - return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), - Mask, N->getMemoryVT(), N->getMemOperand(), - N->isTruncatingStore()); - } - return Op; + assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && + "Cannot lower masked store op."); + + assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) || + (Subtarget.hasBWI() && + (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && + "Unsupported masked store op."); + + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } -static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget->hasAVX512() && + assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); @@ -20226,7 +21576,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && + if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but @@ -20314,8 +21664,7 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, return NOOP; } -/// LowerOperation - Provide custom lowering hooks for some operations. -/// +/// Provide custom lowering hooks for some operations. 
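LowerMLOAD/LowerMSTORE above widen a narrow masked access to 512 bits when VLX is absent: the mask is extended with zero (false) lanes, the pass-through value is widened, the wide masked op is emitted, and for loads the original-width subvector is extracted back out. A sketch of why the padded lanes are harmless, modelling masked-load semantics on plain arrays rather than AVX-512 intrinsics:

#include <array>
#include <cassert>
#include <cstdint>

// Masked-load semantics for one lane: memory value where the mask bit is
// set, pass-through (Src0) value otherwise.
int32_t maskedLane(bool m, int32_t mem, int32_t src0) { return m ? mem : src0; }

int main() {
  std::array<int32_t, 8> Mem{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<bool, 8> Mask{true, false, true, false, true, false, true, false};

  // Narrow (v8i32-style) masked load with a zero pass-through.
  std::array<int32_t, 8> Narrow{};
  for (int i = 0; i < 8; ++i)
    Narrow[i] = maskedLane(Mask[i], Mem[i], 0);

  // Widened (v16i32-style) masked load: mask padded with false and the
  // pass-through padded with zeros, so the extra lanes never touch memory
  // and never affect the result.
  std::array<int32_t, 16> Wide{};
  for (int i = 0; i < 16; ++i) {
    bool m = (i < 8) ? Mask[i] : false;          // ExtendToType on the mask
    int32_t mem = (i < 8) ? Mem[i] : 0;          // never read when m is false
    Wide[i] = maskedLane(m, mem, 0);
  }

  // Extracting the low 8 lanes of the wide result gives the narrow result.
  for (int i = 0; i < 8; ++i)
    assert(Wide[i] == Narrow[i]);
  return 0;
}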
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); @@ -20323,8 +21672,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); - case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); + case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); @@ -20377,14 +21731,18 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: + return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::MULHS: + case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); @@ -20417,11 +21775,34 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG); } } -/// ReplaceNodeResults - Replace a node with an illegal result type -/// with a new node built out of custom code. +/// Places new result values for the node in Results (their number +/// and types must exactly match those of the original return values of +/// the node), or leaves Results empty, which indicates that the node is not +/// to be custom lowered after all. +void X86TargetLowering::LowerOperationWrapper(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + SDValue Res = LowerOperation(SDValue(N, 0), DAG); + + if (!Res.getNode()) + return; + + assert((N->getNumValues() <= Res->getNumValues()) && + "Lowering returned the wrong number of results!"); + + // Places new result values base on N result number. + // In some cases (LowerSINT_TO_FP for example) Res has more result values + // than original node, chain should be dropped(last value). 
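The v2i32 UINT_TO_FP expansion further down in ReplaceNodeResults zero-extends to i64, ORs in 0x4330000000000000 — the bit pattern of 2^52 — so the integer lands in the mantissa of a double, and then subtracts 2^52. A scalar sketch of that bias trick (assumes IEEE-754 doubles; memcpy stands in for the DAG bitcast):

#include <cassert>
#include <cstdint>
#include <cstring>

// Convert an unsigned 32-bit integer to double without an int->fp
// instruction: place it in the mantissa of 2^52 and subtract 2^52.
double uintToDouble(uint32_t u) {
  uint64_t bits = UINT64_C(0x4330000000000000) | uint64_t(u); // 2^52 + u, exact
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d - 4503599627370496.0; // subtract 2^52
}

int main() {
  const uint32_t tests[] = {0u, 1u, 42u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t u : tests)
    assert(uintToDouble(u) == double(u));
  return 0;
}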
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + Results.push_back(Res.getValue(I)); +} + +/// Replace a node with an illegal result type with a new node built out of +/// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const { @@ -20432,15 +21813,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, llvm_unreachable("Do not know how to custom type legalize this operation!"); case X86ISD::AVG: { // Legalize types for X86ISD::AVG by expanding vectors. - assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); auto InVT = N->getValueType(0); auto InVTSize = InVT.getSizeInBits(); const unsigned RegSize = (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; - assert((!Subtarget->hasAVX512() || RegSize < 512) && + assert((!Subtarget.hasAVX512() || RegSize < 512) && "512-bit vector requires AVX512"); - assert((!Subtarget->hasAVX2() || RegSize < 256) && + assert((!Subtarget.hasAVX2() || RegSize < 256) && "256-bit vector requires AVX2"); auto ElemVT = InVT.getVectorElementType(); @@ -20503,24 +21884,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); // Return a load from the stack slot. if (StackSlot.getNode()) - Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, - MachinePointerInfo(), - false, false, false, 0)); + Results.push_back( + DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); else Results.push_back(FIST); } return; } case ISD::UINT_TO_FP: { - assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (N->getOperand(0).getValueType() != MVT::v2i32 || N->getValueType(0) != MVT::v2f32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N->getOperand(0)); - SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, - MVT::f64); - SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue VBias = + DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); @@ -20588,20 +21967,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); - swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, - Regs64bit ? X86::RBX : X86::EBX, - swapInL, cpInH.getValue(1)); - swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, - Regs64bit ? X86::RCX : X86::ECX, - swapInH, swapInL.getValue(1)); - SDValue Ops[] = { swapInH.getValue(0), - N->getOperand(1), - swapInH.getValue(1) }; + swapInH = + DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, + swapInH, cpInH.getValue(1)); + // If the current function needs the base pointer, RBX, + // we shouldn't use cmpxchg directly. + // Indeed the lowering of that instruction will clobber + // that register and since RBX will be a reserved register + // the register allocator will not make sure its value will + // be properly saved and restored around this live-range. + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); - unsigned Opcode = Regs64bit ? 
X86ISD::LCMPXCHG16_DAG : - X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + if (TRI->hasBasePointer(DAG.getMachineFunction()) && + (BasePtr == X86::RBX || BasePtr == X86::EBX)) { + // ISel prefers the LCMPXCHG64 variant. + // If that assert breaks, that means it is not the case anymore, + // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, + // not just EBX. This is a matter of accepting i64 input for that + // pseudo, and restoring into the register of the right wide + // in expand pseudo. Everything else should just work. + assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && + "Saving only half of the RBX"); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG + : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; + SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + HalfT, swapInH.getValue(1)); + SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, + RBXSave, + /*Glue*/ RBXSave.getValue(2)}; + Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + } else { + unsigned Opcode = + Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; + swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, swapInL, + swapInH.getValue(1)); + SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), + swapInL.getValue(1)}; + Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -20639,7 +22047,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::BITCAST: { - assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0)->getValueType(0); @@ -20666,7 +22074,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, ToVecInt, DAG.getIntPtrConstant(i, dl))); - Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); } } } @@ -20703,7 +22111,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; - case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -20724,7 +22131,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::ADDUS: return "X86ISD::ADDUS"; @@ -20742,7 +22148,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::FRCPS: return "X86ISD::FRCPS"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return 
"X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; @@ -20750,6 +22158,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; + case X86ISD::EH_SJLJ_SETUP_DISPATCH: + return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; @@ -20757,6 +22167,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; + case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: + return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; + case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: + return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; + case X86ISD::LADD: return "X86ISD::LADD"; + case X86ISD::LSUB: return "X86ISD::LSUB"; + case X86ISD::LOR: return "X86ISD::LOR"; + case X86ISD::LXOR: return "X86ISD::LXOR"; + case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -20778,8 +22197,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; + case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; @@ -20802,6 +22223,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; @@ -20842,6 +22264,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; @@ -20852,8 +22275,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; - case X86ISD::SFENCE: return "X86ISD::SFENCE"; - case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; @@ -20866,6 +22287,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; + case X86ISD::VPERMIL2: return 
"X86ISD::VPERMIL2"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -20878,6 +22300,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; + case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; @@ -20898,6 +22322,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; @@ -20908,26 +22333,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; + case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; + case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND"; + case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND"; } return nullptr; } -// isLegalAddressingMode - Return true if the addressing mode represented -// by AM is legal for this target, for a load/store of the specified type. +/// Return true if the addressing mode represented by AM is legal for this +/// target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); - Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { - unsigned GVFlags = - Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); + unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) @@ -20939,8 +22365,8 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; // If lower 4G is not available, then we must use rip-relative addressing. - if ((M != CodeModel::Small || R != Reloc::Static) && - Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + if ((M != CodeModel::Small || isPositionIndependent()) && + Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } @@ -20977,7 +22403,7 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make // variable shifts just as cheap as scalar ones. 
- if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) + if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a @@ -21026,12 +22452,12 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. - return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. - return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); + return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { @@ -21062,7 +22488,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!Subtarget->hasAnyFMA()) + if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -21086,8 +22512,8 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { return !(VT1 == MVT::i32 && VT2 == MVT::i16); } -/// isShuffleMaskLegal - Targets can use this to indicate that they only -/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// Targets can use this to indicate that they only support *some* +/// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool @@ -21121,9 +22547,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. -static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, +static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); @@ -21167,21 +22593,21 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, // sinkMBB: // EAX is live into the sinkMBB sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) + .addReg(X86::EAX); - MI->eraseFromParent(); + MI.eraseFromParent(); return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. 
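// A rough user-level sketch of the register contract emitXBegin models above,
// using the RTM intrinsics from <immintrin.h> (assumes -mrtm): _xbegin()
// returns _XBEGIN_STARTED (the -1 the mainMBB materializes in EAX) when the
// transaction starts, otherwise the abort status copied out of EAX.
#include <immintrin.h>
#include <atomic>

static std::atomic_flag FallbackLock = ATOMIC_FLAG_INIT;

void incrementTransactionally(long &Counter) {
  unsigned Status = _xbegin();              // XBEGIN; EAX == -1 on the hot path
  if (Status == _XBEGIN_STARTED) {
    ++Counter;                              // transactional path
    _xend();                                // XEND commits the transaction
  } else {
    // Abort path: Status holds the abort code; fall back to a plain lock.
    while (FallbackLock.test_and_set(std::memory_order_acquire))
      ;
    ++Counter;
    FallbackLock.clear(std::memory_order_release);
  }
}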
-static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, +static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; @@ -21193,32 +22619,31 @@ static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; } - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - unsigned NumArgs = MI->getNumOperands(); + unsigned NumArgs = MI.getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI->getOperand(i); + MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } - if (MI->hasOneMemOperand()) - MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + if (MI.hasOneMemOperand()) + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - BuildMI(*BB, MI, dl, - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::XMM0); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(X86::XMM0); - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } // FIXME: Custom handling because TableGen doesn't support multiple implicit // defs in an instruction pattern -static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, +static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; @@ -21230,93 +22655,90 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - unsigned NumArgs = MI->getNumOperands(); // remove the results + unsigned NumArgs = MI.getNumOperands(); // remove the results for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI->getOperand(i); + MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } - if (MI->hasOneMemOperand()) - MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + if (MI.hasOneMemOperand()) + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - BuildMI(*BB, MI, dl, - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::ECX); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(X86::ECX); - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } -static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget *Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); +static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget) { + DebugLoc dl = MI.getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert input VAL into EAX BuildMI(*BB, MI, dl, 
TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI->getOperand(0).getReg()); + .addReg(MI.getOperand(0).getReg()); // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) - .addReg(X86::ECX) - .addReg(X86::ECX); + BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); + // insert zero to EDX - BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) - .addReg(X86::EDX) - .addReg(X86::EDX); + BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); + // insert WRPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); - MI->eraseFromParent(); // The pseudo is gone now. + MI.eraseFromParent(); // The pseudo is gone now. return BB; } -static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget *Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); +static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget) { + DebugLoc dl = MI.getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) - .addReg(X86::ECX) - .addReg(X86::ECX); + BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); + // insert RDPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::EAX); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(X86::EAX); - MI->eraseFromParent(); // The pseudo is gone now. + MI.eraseFromParent(); // The pseudo is gone now. return BB; } -static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget *Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); +static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget, + unsigned Opc) { + DebugLoc dl = MI.getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. - unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.addOperand(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI->getOperand(ValOps).getReg()); + .addReg(MI.getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) - .addReg(MI->getOperand(ValOps+1).getReg()); + .addReg(MI.getOperand(ValOps + 1).getReg()); // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + BuildMI(*BB, MI, dl, TII->get(Opc)); - MI->eraseFromParent(); // The pseudo is gone now. + MI.eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * -X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, +X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. 
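// A rough source-level sketch of what the VAARG_64 pseudo has to implement:
// each va_arg on x86-64 either reads the next register-save slot (tracked by
// gp_offset/fp_offset) or falls back to the overflow area, which is the
// branchy sequence this custom inserter emits. (Front ends may expand va_arg
// themselves; the pseudo covers the LLVM IR va_arg instruction.)
#include <cstdarg>

int sumInts(int N, ...) {
  va_list Args;
  va_start(Args, N);
  int Total = 0;
  for (int I = 0; I < N; ++I)
    Total += va_arg(Args, int);   // one register-area / overflow-area decision
  va_end(Args);
  return Total;
}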
@@ -21328,31 +22750,31 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) - assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); - unsigned DestReg = MI->getOperand(0).getReg(); - MachineOperand &Base = MI->getOperand(1); - MachineOperand &Scale = MI->getOperand(2); - MachineOperand &Index = MI->getOperand(3); - MachineOperand &Disp = MI->getOperand(4); - MachineOperand &Segment = MI->getOperand(5); - unsigned ArgSize = MI->getOperand(6).getImm(); - unsigned ArgMode = MI->getOperand(7).getImm(); - unsigned Align = MI->getOperand(8).getImm(); + unsigned DestReg = MI.getOperand(0).getReg(); + MachineOperand &Base = MI.getOperand(1); + MachineOperand &Scale = MI.getOperand(2); + MachineOperand &Index = MI.getOperand(3); + MachineOperand &Disp = MI.getOperand(4); + MachineOperand &Segment = MI.getOperand(5); + unsigned ArgSize = MI.getOperand(6).getImm(); + unsigned ArgMode = MI.getOperand(7).getImm(); + unsigned Align = MI.getOperand(8).getImm(); // Memory Reference - assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); // Machine Information - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset @@ -21521,7 +22943,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address - assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) @@ -21563,15 +22985,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, } // Erase the pseudo instruction - MI->eraseFromParent(); + MI.eraseFromParent(); return endMBB; } -MachineBasicBlock * -X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( - MachineInstr *MI, - MachineBasicBlock *MBB) const { +MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, @@ -21602,14 +23022,14 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. 
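// The overflow-area alignment above is the standard power-of-two round-up,
// aligned = (addr + (align - 1)) & ~(align - 1). A freestanding sketch of the
// same computation, valid only when Align is a power of two (which is what
// the isPowerOf2_32 assert guarantees):
#include <cassert>
#include <cstdint>

uint64_t alignUp(uint64_t Addr, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (Addr + (Align - 1)) & ~(Align - 1);
}
// e.g. alignUp(0x1001, 16) == 0x1010 and alignUp(0x1000, 16) == 0x1000.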
- const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); - unsigned CountReg = MI->getOperand(0).getReg(); - int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); - int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + unsigned CountReg = MI.getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); - if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { + if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); @@ -21618,29 +23038,29 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". - assert((MI->getNumOperands() <= 3 || - !MI->getOperand(MI->getNumOperands() - 1).isReg() || - MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) - && "Expected last argument to be EFLAGS"); - unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; + assert((MI.getNumOperands() <= 3 || + !MI.getOperand(MI.getNumOperands() - 1).isReg() || + MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && + "Expected last argument to be EFLAGS"); + unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. - for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { + for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) - .addFrameIndex(RegSaveFrameIndex) - .addImm(/*Scale=*/1) - .addReg(/*IndexReg=*/0) - .addImm(/*Disp=*/Offset) - .addReg(/*Segment=*/0) - .addReg(MI->getOperand(i).getReg()) - .addMemOperand(MMO); + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI.getOperand(i).getReg()) + .addMemOperand(MMO); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + MI.eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } @@ -21684,8 +23104,8 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. 
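// A rough sketch of the kind of source select these CMOV pseudos stand for:
// when the operands live in classes with no real conditional move (FP or
// vector registers, or i8 without CMOV support), ISel emits a CMOV_* pseudo
// and EmitLoweredSelect below expands it into the branch-plus-PHI diamond.
float selectValue(bool Cond, float A, float B) {
  // May be matched to CMOV_FR32 and then lowered to a compare, a conditional
  // branch, and a PHI merging A and B (exact selection depends on subtarget).
  return Cond ? A : B;
}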
-static bool isCMOVPseudo(MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isCMOVPseudo(MachineInstr &MI) { + switch (MI.getOpcode()) { case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: @@ -21715,10 +23135,10 @@ static bool isCMOVPseudo(MachineInstr *MI) { } MachineBasicBlock * -X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, +X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the @@ -21837,8 +23257,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // retq // MachineInstr *CascadedCMOV = nullptr; - MachineInstr *LastCMOV = MI; - X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + MachineInstr *LastCMOV = &MI; + X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); @@ -21849,8 +23269,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. - while (NextMIIt != BB->end() && - isCMOVPseudo(NextMIIt) && + while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; @@ -21860,10 +23279,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. - if (LastCMOV == MI && - NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && - NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() && + if (LastCMOV == &MI && NextMIIt != BB->end() && + NextMIIt->getOpcode() == MI.getOpcode() && + NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { CascadedCMOV = &*NextMIIt; } @@ -21885,7 +23304,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && @@ -21976,12 +23395,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). if (CascadedCMOV) { - MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. 
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), CascadedCMOV->getOperand(0).getReg()) - .addReg(MI->getOperand(0).getReg()); + .addReg(MI.getOperand(0).getReg()); CascadedCMOV->eraseFromParent(); } @@ -21993,7 +23412,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, MachineBasicBlock *BB) const { // Combine the following atomic floating-point modification pattern: // a.store(reg OP a.load(acquire), release) @@ -22002,52 +23421,55 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, // movss %xmm, (%gpr) // Or sd equivalent for 64-bit operations. unsigned MOp, FOp; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); - case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; - case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + case X86::RELEASE_FADD32mr: + FOp = X86::ADDSSrm; + MOp = X86::MOVSSmr; + break; + case X86::RELEASE_FADD64mr: + FOp = X86::ADDSDrm; + MOp = X86::MOVSDmr; + break; } - const X86InstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - MachineOperand MSrc = MI->getOperand(0); - unsigned VSrc = MI->getOperand(5).getReg(); - const MachineOperand &Disp = MI->getOperand(3); - MachineOperand ZeroDisp = MachineOperand::CreateImm(0); - bool hasDisp = Disp.isGlobal() || Disp.isImm(); - if (hasDisp && MSrc.isReg()) - MSrc.setIsKill(false); - MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) - .addOperand(/*Base=*/MSrc) - .addImm(/*Scale=*/1) - .addReg(/*Index=*/0) - .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) - .addReg(0); - MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), - MRI.createVirtualRegister(MRI.getRegClass(VSrc))) - .addReg(VSrc) - .addOperand(/*Base=*/MSrc) - .addImm(/*Scale=*/1) - .addReg(/*Index=*/0) - .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) - .addReg(/*Segment=*/0); - MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); - MI->eraseFromParent(); // The pseudo instruction is gone now. + unsigned ValOpIdx = X86::AddrNumOperands; + unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand &Operand = MI.getOperand(i); + // Clear any kill flags on register operands as we'll create a second + // instruction using the same address operands. + if (Operand.isReg()) + Operand.setIsKill(false); + MIB.addOperand(Operand); + } + MachineInstr *FOpMI = MIB; + MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI.getOperand(i)); + MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); + MI.eraseFromParent(); // The pseudo instruction is gone now. 
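// A rough source-level shape for the RELEASE_FADD32mr pattern handled above:
// an acquire load, a float add, and a release store back to the same atomic
// location. Whether ISel actually forms the pseudo depends on the orderings
// and the matched DAG, so treat this as a sketch.
#include <atomic>

void addToShared(std::atomic<float> &A, float X) {
  float Old = A.load(std::memory_order_acquire);
  A.store(Old + X, std::memory_order_release);   // -> movss / addss / movss
}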
return BB; } MachineBasicBlock * -X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); - const bool Is64Bit = Subtarget->is64Bit(); - const bool IsLP64 = Subtarget->isTarget64BitLP64(); + const bool Is64Bit = Subtarget.is64Bit(); + const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; @@ -22077,11 +23499,12 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), - bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), - tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), - SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), - sizeVReg = MI->getOperand(1).getReg(), - physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI.getOperand(1).getReg(), + physSPReg = + IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); @@ -22113,7 +23536,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = - Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); + Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -22156,43 +23579,33 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), - MI->getOperand(0).getReg()) - .addReg(mallocPtrVReg).addMBB(mallocMBB) - .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + MI.getOperand(0).getReg()) + .addReg(mallocPtrVReg) + .addMBB(mallocMBB) + .addReg(bumpSPPtrVReg) + .addMBB(bumpMBB); // Delete the original pseudo instruction. - MI->eraseFromParent(); + MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * -X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, - MachineBasicBlock *BB) const { - assert(!Subtarget->isTargetMachO()); - DebugLoc DL = MI->getDebugLoc(); - MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( - *BB->getParent(), *BB, MI, DL, false); - MachineBasicBlock *ResumeBB = ResumeMI->getParent(); - MI->eraseFromParent(); // The pseudo instruction is gone now. 
- return ResumeBB; -} - -MachineBasicBlock * -X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, +X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); + DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. - if (!Subtarget->is32Bit()) + if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up @@ -22203,7 +23616,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); - MI->getOperand(0).setMBB(RestoreMBB); + MI.getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); @@ -22212,37 +23625,37 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, +X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction()->getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. - if (IsSEH && Subtarget->is32Bit()) { - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + if (IsSEH && Subtarget.is32Bit()) { + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } MachineBasicBlock * -X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI, +X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -22257,86 +23670,89 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, +X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. 
We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); - assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); - assert(MI->getOperand(3).isGlobal() && "This should be a global"); + assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); + assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - Subtarget->is64Bit() ? - Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : - Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); - if (Subtarget->is64Bit()) { - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, - TII->get(X86::MOV64rm), X86::RDI) - .addReg(X86::RIP) - .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, - MI->getOperand(3).getTargetFlags()) - .addReg(0); + Subtarget.is64Bit() ? + Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : + Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); + if (Subtarget.is64Bit()) { + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); - } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, - TII->get(X86::MOV32rm), X86::EAX) - .addReg(0) - .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, - MI->getOperand(3).getTargetFlags()) - .addReg(0); + } else if (!isPositionIndependent()) { + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0) + .addReg(0) + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, - TII->get(X86::MOV32rm), X86::EAX) - .addReg(TII->getGlobalBaseReg(F)) - .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, - MI->getOperand(3).getTargetFlags()) - .addReg(0); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0) + .addReg(0) + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + MI.eraseFromParent(); // The pseudo instruction is gone now. 
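// A minimal source-level trigger for the TLS lowerings above (a sketch): a
// thread_local access. On Darwin the address comes from the TLVP descriptor
// via the indirect call materialized by EmitLoweredTLSCall; on ELF with the
// general-dynamic model the TLSADDR pseudo becomes a __tls_get_addr call
// bracketed by the CALLSEQ markers added in EmitLoweredTLSAddr.
thread_local int PerThreadCounter = 0;

int bumpCounter() {
  return ++PerThreadCounter;   // address obtained through the TLS call sequence
}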
return BB; } MachineBasicBlock * -X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; - DstReg = MI->getOperand(CurOp++).getReg(); + DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(RC->hasType(MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); @@ -22384,16 +23800,15 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); - Reloc::Model RM = MF->getTarget().getRelocationModel(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && - (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) @@ -22406,7 +23821,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) - .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else @@ -22415,9 +23830,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.addOperand(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); @@ -22428,7 +23843,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -22447,7 +23862,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = - Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); + Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); @@ -22461,21 +23876,21 @@ 
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * -X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && @@ -22485,7 +23900,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -22500,41 +23915,275 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.addOperand(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), LabelOffset); + MIB.addDisp(MI.getOperand(i), LabelOffset); else - MIB.addOperand(MI->getOperand(i)); + MIB.addOperand(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) - MIB.addDisp(MI->getOperand(i), SPOffset); + MIB.addDisp(MI.getOperand(i), SPOffset); else - MIB.addOperand(MI->getOperand(i)); + MIB.addOperand(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); - MI->eraseFromParent(); + MI.eraseFromParent(); return MBB; } +void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, + MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, + int FI) const { + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); + + unsigned Op = 0; + unsigned VR = 0; + + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && + !isPositionIndependent(); + + if (UseImmLabel) { + Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + } else { + const TargetRegisterClass *TRC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + VR = MRI->createVirtualRegister(TRC); + Op = (PVT == MVT::i64) ? 
X86::MOV64mr : X86::MOV32mr; + + /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */ + + if (Subtarget.is64Bit()) + BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addMBB(DispatchBB) + .addReg(0); + else + BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) + .addReg(0) /* XII->getGlobalBaseReg(MF) */ + .addImm(1) + .addReg(0) + .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) + .addReg(0); + } + + MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); + addFrameReference(MIB, FI, 36); + if (UseImmLabel) + MIB.addMBB(DispatchBB); + else + MIB.addReg(VR); +} + +MachineBasicBlock * +X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = BB->getParent(); + MachineModuleInfo *MMI = &MF->getMMI(); + MachineFrameInfo *MFI = MF->getFrameInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + int FI = MFI->getFunctionContextIndex(); + + // Get a mapping of the call site numbers to all of the landing pads they're + // associated with. + DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad; + unsigned MaxCSNum = 0; + for (auto &MBB : *MF) { + if (!MBB.isEHPad()) + continue; + + MCSymbol *Sym = nullptr; + for (const auto &MI : MBB) { + if (MI.isDebugValue()) + continue; + + assert(MI.isEHLabel() && "expected EH_LABEL"); + Sym = MI.getOperand(0).getMCSymbol(); + break; + } + + if (!MMI->hasCallSiteLandingPad(Sym)) + continue; + + for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) { + CallSiteNumToLPad[CSI].push_back(&MBB); + MaxCSNum = std::max(MaxCSNum, CSI); + } + } + + // Get an ordered list of the machine basic blocks for the jump table. + std::vector<MachineBasicBlock *> LPadList; + SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs; + LPadList.reserve(CallSiteNumToLPad.size()); + + for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { + for (auto &LP : CallSiteNumToLPad[CSI]) { + LPadList.push_back(LP); + InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); + } + } + + assert(!LPadList.empty() && + "No landing pad destinations for the dispatch jump table!"); + + // Create the MBBs for the dispatch code. + + // Shove the dispatch's address into the return slot in the function context. + MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); + DispatchBB->setIsEHPad(true); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + BuildMI(TrapBB, DL, TII->get(X86::TRAP)); + DispatchBB->addSuccessor(TrapBB); + + MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); + DispatchBB->addSuccessor(DispContBB); + + // Insert MBBs. + MF->push_back(DispatchBB); + MF->push_back(DispContBB); + MF->push_back(TrapBB); + + // Insert code into the entry block that creates and registers the function + // context. + SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); + + // Create the jump table and associated information + MachineJumpTableInfo *JTI = + MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); + unsigned MJTI = JTI->createJumpTableIndex(LPadList); + + const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); + const X86RegisterInfo &RI = XII->getRegisterInfo(); + + // Add a register mask with no preserved registers. This results in all + // registers being marked as clobbered. 
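// A conceptual sketch of the dispatch block assembled below (names here are
// illustrative, not the generated MachineInstrs): the runtime stores a
// one-based call-site index into the function context; the dispatch loads it,
// bounds-checks it against the landing-pad list, and indirect-jumps through
// the jump table, trapping on out-of-range values.
void dispatch(unsigned CallSiteIndex, void (*const LandingPads[])(),
              unsigned NumPads) {
  if (CallSiteIndex > NumPads)       // the CMP32ri + JA_1 to TrapBB below
    __builtin_trap();
  LandingPads[CallSiteIndex - 1]();  // SUB32ri by one, then JMP*m via the table
}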
+ if (RI.hasBasePointer(*MF)) { + const bool FPIs64Bit = + Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setRestoreBasePointer(MF); + + unsigned FP = RI.getFrameRegister(*MF); + unsigned BP = RI.getBaseRegister(); + unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, + MFI->getRestoreBasePointerOffset()) + .addRegMask(RI.getNoPreservedMask()); + } else { + BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) + .addRegMask(RI.getNoPreservedMask()); + } + + unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass); + addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, + 4); + BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) + .addReg(IReg) + .addImm(LPadList.size()); + BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB); + + unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass); + BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg) + .addReg(IReg) + .addImm(1); + BuildMI(DispContBB, DL, + TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m)) + .addReg(0) + .addImm(Subtarget.is64Bit() ? 8 : 4) + .addReg(JReg) + .addJumpTableIndex(MJTI) + .addReg(0); + + // Add the jump table entries as successors to the MBB. + SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; + for (auto &LP : LPadList) + if (SeenMBBs.insert(LP).second) + DispContBB->addSuccessor(LP); + + // N.B. the order the invoke BBs are processed in doesn't matter here. + SmallVector<MachineBasicBlock *, 64> MBBLPads; + const MCPhysReg *SavedRegs = + Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF); + for (MachineBasicBlock *MBB : InvokeBBs) { + // Remove the landing pad successor from the invoke block and replace it + // with the new dispatch block. + // Keep a copy of Successors since it's modified inside the loop. + SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(), + MBB->succ_rend()); + // FIXME: Avoid quadratic complexity. + for (auto MBBS : Successors) { + if (MBBS->isEHPad()) { + MBB->removeSuccessor(MBBS); + MBBLPads.push_back(MBBS); + } + } + + MBB->addSuccessor(DispatchBB); + + // Find the invoke call and mark all of the callee-saved registers as + // 'implicit defined' so that they're spilled. This prevents code from + // moving instructions to before the EH block, where they will never be + // executed. + for (auto &II : reverse(*MBB)) { + if (!II.isCall()) + continue; + + DenseMap<unsigned, bool> DefRegs; + for (auto &MOp : II.operands()) + if (MOp.isReg()) + DefRegs[MOp.getReg()] = true; + + MachineInstrBuilder MIB(*MF, &II); + for (unsigned RI = 0; SavedRegs[RI]; ++RI) { + unsigned Reg = SavedRegs[RI]; + if (!DefRegs[Reg]) + MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); + } + + break; + } + } + + // Mark all former landing pads as non-landing pads. The dispatch is the only + // landing pad now. + for (auto &LP : MBBLPads) + LP->setIsEHPad(false); + + // The instruction is gone now. + MI.eraseFromParent(); + return BB; +} + // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer // to remove extra copies in the loop. // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). 
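// Why the 231 form suits accumulator loops (a sketch): vfmadd213 computes
// dst = src2*dst + src3, so the running sum has to arrive in the src3 slot
// while the product clobbers dst, forcing the coalescer to shuffle copies;
// vfmadd231 computes dst = src2*src3 + dst and writes straight back into the
// accumulator. A loop of the shape below is the typical beneficiary:
#include <cmath>
#include <cstddef>

double dotProduct(const double *A, const double *B, size_t N) {
  double Acc = 0.0;
  for (size_t I = 0; I < N; ++I)
    Acc = std::fma(A[I], B[I], Acc);   // naturally a vfmadd231sd into Acc with -mfma
  return Acc;
}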
MachineBasicBlock * -X86TargetLowering::emitFMA3Instr(MachineInstr *MI, +X86TargetLowering::emitFMA3Instr(MachineInstr &MI, MachineBasicBlock *MBB) const { - MachineOperand &AddendOp = MI->getOperand(3); + MachineOperand &AddendOp = MI.getOperand(3); // Bail out early if the addend isn't a register - we can't switch these. if (!AddendOp.isReg()) @@ -22565,55 +24214,120 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, assert(AddendDef.getOperand(i).isReg()); MachineOperand PHISrcOp = AddendDef.getOperand(i); MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); - if (&PHISrcInst == MI) { + if (&PHISrcInst == &MI) { // Found a matching instruction. unsigned NewFMAOpc = 0; - switch (MI->getOpcode()) { - case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; - case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; - case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; - case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; - case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; - case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; - case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; - case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; - case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; - case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; - case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; - case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; - case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; - case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; - case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; - case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; - case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; - case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; - case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; - case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; - - case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; - case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; - case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; - case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; - case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; - case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; - case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; - case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; - case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; - case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; - case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; - case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; - default: llvm_unreachable("Unrecognized FMA variant."); + switch (MI.getOpcode()) { + case X86::VFMADDPDr213r: + NewFMAOpc = X86::VFMADDPDr231r; + break; + case X86::VFMADDPSr213r: + NewFMAOpc = X86::VFMADDPSr231r; + break; + case X86::VFMADDSDr213r: + NewFMAOpc = X86::VFMADDSDr231r; + break; + case X86::VFMADDSSr213r: + NewFMAOpc = X86::VFMADDSSr231r; + break; + case X86::VFMSUBPDr213r: + NewFMAOpc = X86::VFMSUBPDr231r; + break; + case X86::VFMSUBPSr213r: + NewFMAOpc = X86::VFMSUBPSr231r; + break; + case X86::VFMSUBSDr213r: + NewFMAOpc = X86::VFMSUBSDr231r; + break; + case X86::VFMSUBSSr213r: + NewFMAOpc = 
X86::VFMSUBSSr231r; + break; + case X86::VFNMADDPDr213r: + NewFMAOpc = X86::VFNMADDPDr231r; + break; + case X86::VFNMADDPSr213r: + NewFMAOpc = X86::VFNMADDPSr231r; + break; + case X86::VFNMADDSDr213r: + NewFMAOpc = X86::VFNMADDSDr231r; + break; + case X86::VFNMADDSSr213r: + NewFMAOpc = X86::VFNMADDSSr231r; + break; + case X86::VFNMSUBPDr213r: + NewFMAOpc = X86::VFNMSUBPDr231r; + break; + case X86::VFNMSUBPSr213r: + NewFMAOpc = X86::VFNMSUBPSr231r; + break; + case X86::VFNMSUBSDr213r: + NewFMAOpc = X86::VFNMSUBSDr231r; + break; + case X86::VFNMSUBSSr213r: + NewFMAOpc = X86::VFNMSUBSSr231r; + break; + case X86::VFMADDSUBPDr213r: + NewFMAOpc = X86::VFMADDSUBPDr231r; + break; + case X86::VFMADDSUBPSr213r: + NewFMAOpc = X86::VFMADDSUBPSr231r; + break; + case X86::VFMSUBADDPDr213r: + NewFMAOpc = X86::VFMSUBADDPDr231r; + break; + case X86::VFMSUBADDPSr213r: + NewFMAOpc = X86::VFMSUBADDPSr231r; + break; + + case X86::VFMADDPDr213rY: + NewFMAOpc = X86::VFMADDPDr231rY; + break; + case X86::VFMADDPSr213rY: + NewFMAOpc = X86::VFMADDPSr231rY; + break; + case X86::VFMSUBPDr213rY: + NewFMAOpc = X86::VFMSUBPDr231rY; + break; + case X86::VFMSUBPSr213rY: + NewFMAOpc = X86::VFMSUBPSr231rY; + break; + case X86::VFNMADDPDr213rY: + NewFMAOpc = X86::VFNMADDPDr231rY; + break; + case X86::VFNMADDPSr213rY: + NewFMAOpc = X86::VFNMADDPSr231rY; + break; + case X86::VFNMSUBPDr213rY: + NewFMAOpc = X86::VFNMSUBPDr231rY; + break; + case X86::VFNMSUBPSr213rY: + NewFMAOpc = X86::VFNMSUBPSr231rY; + break; + case X86::VFMADDSUBPDr213rY: + NewFMAOpc = X86::VFMADDSUBPDr231rY; + break; + case X86::VFMADDSUBPSr213rY: + NewFMAOpc = X86::VFMADDSUBPSr231rY; + break; + case X86::VFMSUBADDPDr213rY: + NewFMAOpc = X86::VFMSUBADDPDr231rY; + break; + case X86::VFMSUBADDPSr213rY: + NewFMAOpc = X86::VFMSUBADDPSr231rY; + break; + default: + llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineInstrBuilder MIB = - BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(1)); + BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc)) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(3)) + .addOperand(MI.getOperand(2)) + .addOperand(MI.getOperand(1)); MBB->insert(MachineBasicBlock::iterator(MI), MIB); - MI->eraseFromParent(); + MI.eraseFromParent(); } } @@ -22621,9 +24335,9 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, +X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: @@ -22641,8 +24355,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); - case X86::WIN_ALLOCA: - return EmitLoweredWinAlloca(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: @@ -22679,31 +24391,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::RDFLAGS32: case X86::RDFLAGS64: { - DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); unsigned 
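For reference, the 213/231 suffixes in the opcode remapping above encode which operand positions are multiplied and which one is added, with the result written to operand 1. A minimal standalone C++ sketch (helper names like fma213/fma231 are illustrative, not LLVM APIs) of why rebuilding the instruction with operands (0, 3, 2, 1) preserves the computed value:

// Illustrative sketch only (not part of the patch).
// vfmadd213: op1 = op2 * op1 + op3
// vfmadd231: op1 = op2 * op3 + op1
#include <cassert>

double fma213(double op1, double op2, double op3) { return op2 * op1 + op3; }
double fma231(double op1, double op2, double op3) { return op2 * op3 + op1; }

int main() {
  double a = 1.5, b = -2.0, c = 4.25;   // original operands (op1, op2, op3)
  double via213 = fma213(a, b, c);
  double via231 = fma231(c, b, a);      // operands reordered as (3, 2, 1)
  assert(via213 == via231);             // same product, same addend
  // The point of the 231 form here: the addend (the loop-carried PHI value)
  // now sits in the destination register, so no extra copy is needed per
  // loop iteration.
}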
PushF = - MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; - unsigned Pop = - MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; - BuildMI(*BB, MI, DL, TII->get(PushF)); - BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); - - MI->eraseFromParent(); // The pseudo is gone now. + MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; + unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; + MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); + // Permit reads of the FLAGS register without it being defined. + // This intrinsic exists to read external processor state in flags, such as + // the trap flag, interrupt flag, and direction flag, none of which are + // modeled by the backend. + Push->getOperand(2).setIsUndef(); + BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); + + MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { - DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); unsigned Push = - MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = - MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; - BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); - MI->eraseFromParent(); // The pseudo is gone now. + MI.eraseFromParent(); // The pseudo is gone now. return BB; } @@ -22721,8 +24437,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. @@ -22750,7 +24466,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Get the X86 opcode to use. unsigned Opc; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; @@ -22763,35 +24479,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } - X86AddressMode AM; - MachineOperand &Op = MI->getOperand(0); - if (Op.isReg()) { - AM.BaseType = X86AddressMode::RegBase; - AM.Base.Reg = Op.getReg(); - } else { - AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = Op.getIndex(); - } - Op = MI->getOperand(1); - if (Op.isImm()) - AM.Scale = Op.getImm(); - Op = MI->getOperand(2); - if (Op.isImm()) - AM.IndexReg = Op.getImm(); - Op = MI->getOperand(3); - if (Op.isGlobal()) { - AM.GV = Op.getGlobal(); - } else { - AM.Disp = Op.getImm(); - } + X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) - .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); + .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. 
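At the machine level the RDFLAGS pseudo handled above is just a push of the flags register followed by a pop into the destination GPR. A hedged, compiler-specific sketch of the same idea (GCC/Clang extended inline assembly on x86-64; not the backend code):

// Illustrative sketch (not part of the patch).
#include <cstdint>
#include <cstdio>

static inline uint64_t read_rflags() {
  uint64_t flags;
  asm volatile("pushfq\n\tpopq %0" : "=r"(flags) : : "memory");
  return flags;
}

int main() {
  uint64_t f = read_rflags();
  // Bit 10 is DF (direction flag), one of the externally visible bits the
  // intrinsic exists to read even though the backend does not model it as a
  // normal register definition -- hence the setIsUndef() on the EFLAGS use.
  printf("RFLAGS = %#llx, DF = %llu\n",
         (unsigned long long)f, (unsigned long long)((f >> 10) & 1));
}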
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); - MI->eraseFromParent(); // The pseudo instruction is gone now. + MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. @@ -22803,9 +24499,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: case X86::VPCMPESTRM128MEM: - assert(Subtarget->hasSSE42() && + assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); + return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -22816,21 +24512,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIREG: case X86::PCMPESTRIMEM: case X86::VPCMPESTRIMEM: - assert(Subtarget->hasSSE42() && + assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); + return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, Subtarget); + return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); + case X86::MONITORX: + return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); // PKU feature case X86::WRPKRU: - return EmitWRPKRU(MI, BB, Subtarget); + return emitWRPKRU(MI, BB, Subtarget); case X86::RDPKRU: - return EmitRDPKRU(MI, BB, Subtarget); + return emitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); + return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -22846,6 +24544,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); + case X86::Int_eh_sjlj_setup_dispatch: + return EmitSjLjDispatchBlock(MI, BB); + case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. @@ -22888,6 +24589,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); + case X86::LCMPXCHG8B_SAVE_EBX: + case X86::LCMPXCHG16B_SAVE_RBX: { + unsigned BasePtr = + MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; + if (!BB->isLiveIn(BasePtr)) + BB->addLiveIn(BasePtr); + return BB; + } } } @@ -22930,33 +24639,9 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - unsigned NumLoBits = 0; - switch (IntId) { - default: break; - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx2_pmovmskb: { - // High bits of movmskp{s|d}, pmovmskb are known zero. - switch (IntId) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
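The FPnn_TO_INTnn_IN_MEM pseudos expanded above temporarily force the x87 control word to round-toward-zero so that the integer store truncates as C requires, then reload the saved control word (the trailing FLDCW16m). A loose standard-C++ analogue of that save/override/restore pattern, assuming <cfenv> rounding control rather than direct control-word manipulation:

// Illustrative analogue only (not the backend mechanism). Strictly conforming
// code would also use "#pragma STDC FENV_ACCESS ON" where supported.
#include <cfenv>
#include <cmath>
#include <cassert>

long truncating_convert(double x) {
  const int OldMode = std::fegetround();  // analogous to saving the control word
  std::fesetround(FE_TOWARDZERO);         // analogous to setting RC = 11b
  long r = std::lrint(x);                 // converts using the current mode
  std::fesetround(OldMode);               // analogous to the FLDCW reload
  return r;
}

int main() {
  volatile double a = 2.9, b = -2.9;      // volatile: keep the conversion at run time
  assert(truncating_convert(a) == 2);
  assert(truncating_convert(b) == -2);    // truncation, not round-to-nearest
}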
- case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; - case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; - case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; - case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; - case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; - case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; - case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; - } - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); - break; - } - } + case X86ISD::MOVMSK: { + unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -22974,8 +24659,8 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( return 1; } -/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the -/// node is a GlobalAddress + offset. +/// Returns true (and the GlobalValue and the offset) if the node is a +/// GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const { @@ -22989,11 +24674,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// Performs shuffle combines for 256-bit vectors. /// FIXME: This could be expanded to support 512 bit vectors as well. -static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget* Subtarget) { +static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc dl(N); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); @@ -23014,8 +24699,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // RESULT: V + zero extended // if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || - V2.getOperand(1).getOpcode() != ISD::UNDEF || - V1.getOperand(1).getOpcode() != ISD::UNDEF) + !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef()) return SDValue(); if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) @@ -23060,195 +24744,556 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // Emit a zeroed vector and insert the desired subvector on its // first half. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); + SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); return DCI.CombineTo(N, InsV); } return SDValue(); } +// Attempt to match a combined shuffle mask against supported unary shuffle +// instructions. +// TODO: Investigate sharing more of this with shuffle lowering. +static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT) { + bool FloatDomain = SrcVT.isFloatingPoint() || + (!Subtarget.hasAVX2() && SrcVT.is256BitVector()); + + // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction. + if (!FloatDomain && SrcVT.is128BitVector() && + isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) { + Shuffle = X86ISD::VZEXT_MOVL; + ShuffleVT = MVT::v2i64; + return true; + } + + // Check if we have SSE3 which will let us use MOVDDUP etc. The + // instructions are no slower than UNPCKLPD but has the option to + // fold the input operand into even an unaligned memory load. 
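The per-intrinsic table removed above becomes unnecessary once known-bits handling keys off X86ISD::MOVMSK directly: MOVMSK packs one sign bit per vector element into the low bits of the result, so everything above bit NumElts-1 is known zero. A small scalar sketch of that fact (illustrative names, not LLVM code):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <cmath>
#include <cstdint>

uint32_t movmsk_ps_emulated(const float (&v)[4]) {
  uint32_t mask = 0;
  for (int i = 0; i < 4; ++i)
    if (std::signbit(v[i]))
      mask |= 1u << i;                 // bit i <- sign bit of element i
  return mask;                         // only bits [3:0] can ever be set
}

int main() {
  float v[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
  uint32_t m = movmsk_ps_emulated(v);
  assert(m == 0b0101u);                // elements 0 and 2 are negative (or -0.0)
  assert((m >> 4) == 0);               // high bits known zero: NumLoBits == 4
}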
+ if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {0, 0})) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + return true; + } + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { + Shuffle = X86ISD::MOVSLDUP; + ShuffleVT = MVT::v4f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { + Shuffle = X86ISD::MOVSHDUP; + ShuffleVT = MVT::v4f32; + return true; + } + } + + if (SrcVT.is256BitVector() && FloatDomain) { + assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v4f64; + return true; + } + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { + Shuffle = X86ISD::MOVSLDUP; + ShuffleVT = MVT::v8f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { + Shuffle = X86ISD::MOVSHDUP; + ShuffleVT = MVT::v8f32; + return true; + } + } + + if (SrcVT.is512BitVector() && FloatDomain) { + assert(Subtarget.hasAVX512() && + "AVX512 required for 512-bit vector shuffles"); + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v8f64; + return true; + } + if (isTargetShuffleEquivalent( + Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { + Shuffle = X86ISD::MOVSLDUP; + ShuffleVT = MVT::v16f32; + return true; + } + if (isTargetShuffleEquivalent( + Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { + Shuffle = X86ISD::MOVSHDUP; + ShuffleVT = MVT::v16f32; + return true; + } + } + + // Attempt to match against broadcast-from-vector. + if (Subtarget.hasAVX2()) { + unsigned NumElts = Mask.size(); + SmallVector<int, 64> BroadcastMask(NumElts, 0); + if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { + unsigned EltSize = SrcVT.getSizeInBits() / NumElts; + ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize) + : MVT::getIntegerVT(EltSize); + ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts); + Shuffle = X86ISD::VBROADCAST; + return true; + } + } + + return false; +} + +// Attempt to match a combined shuffle mask against supported unary immediate +// permute instructions. +// TODO: Investigate sharing more of this with shuffle lowering. +static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { + // Ensure we don't contain any zero elements. + for (int M : Mask) { + if (M == SM_SentinelZero) + return false; + assert(SM_SentinelUndef <= M && M < (int)Mask.size() && + "Expected unary shuffle"); + } + + unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size(); + MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); + + // Handle PSHUFLW/PSHUFHW repeated patterns. + if (MaskScalarSizeInBits == 16) { + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + ArrayRef<int> LoMask(Mask.data() + 0, 4); + ArrayRef<int> HiMask(Mask.data() + 4, 4); + + // PSHUFLW: permute lower 4 elements only. + if (isUndefOrInRange(LoMask, 0, 4) && + isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { + Shuffle = X86ISD::PSHUFLW; + ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); + PermuteImm = getV4X86ShuffleImm(LoMask); + return true; + } + + // PSHUFHW: permute upper 4 elements only. 
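The isTargetShuffleEquivalent checks above are simply asking whether the accumulated mask is the fixed mask of a single-input instruction: MOVDDUP is {0, 0}, MOVSLDUP is {0, 0, 2, 2}, MOVSHDUP is {1, 1, 3, 3}, and so on. A minimal sketch of what "equivalent to that mask" means (generic helper, not an LLVM API):

// Illustrative sketch (not part of the patch).
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<double, N> applyMask(const std::array<double, N> &In,
                                const std::array<int, N> &Mask) {
  std::array<double, N> Out{};
  for (std::size_t i = 0; i < N; ++i)
    Out[i] = In[Mask[i]];              // unary shuffle: every index selects from In
  return Out;
}

int main() {
  std::array<double, 4> V{1.0, 2.0, 3.0, 4.0};
  // MOVSLDUP duplicates the even lanes: mask {0, 0, 2, 2}.
  assert(applyMask(V, {0, 0, 2, 2}) == (std::array<double, 4>{1, 1, 3, 3}));
  // MOVSHDUP duplicates the odd lanes: mask {1, 1, 3, 3}.
  assert(applyMask(V, {1, 1, 3, 3}) == (std::array<double, 4>{2, 2, 4, 4}));
  // MOVDDUP on a v2f64-style vector duplicates the low element: mask {0, 0}.
  std::array<double, 2> W{5.0, 6.0};
  assert(applyMask(W, {0, 0}) == (std::array<double, 2>{5, 5}));
}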
+ if (isUndefOrInRange(HiMask, 4, 8) && + isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { + // Offset the HiMask so that we can create the shuffle immediate. + int OffsetHiMask[4]; + for (int i = 0; i != 4; ++i) + OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); + + Shuffle = X86ISD::PSHUFHW; + ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); + PermuteImm = getV4X86ShuffleImm(OffsetHiMask); + return true; + } + + return false; + } + return false; + } + + // We only support permutation of 32/64 bit elements after this. + if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) + return false; + + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). + bool FloatDomain = SrcVT.isFloatingPoint(); + if (FloatDomain && !Subtarget.hasAVX()) + return false; + + // Pre-AVX2 we must use float shuffles on 256-bit vectors. + if (SrcVT.is256BitVector() && !Subtarget.hasAVX2()) + FloatDomain = true; + + // Check for lane crossing permutes. + if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { + // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). + if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); + PermuteImm = getV4X86ShuffleImm(Mask); + return true; + } + if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) { + SmallVector<int, 4> RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + PermuteImm = getV4X86ShuffleImm(RepeatedMask); + return true; + } + } + return false; + } + + // VPERMILPD can permute with a non-repeating shuffle. + if (FloatDomain && MaskScalarSizeInBits == 64) { + Shuffle = X86ISD::VPERMILPI; + ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); + PermuteImm = 0; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); + PermuteImm |= (M & 1) << i; + } + return true; + } + + // We need a repeating shuffle mask for VPERMILPS/PSHUFD. + SmallVector<int, 4> RepeatedMask; + if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) + return false; + + // Narrow the repeated mask for 32-bit element permutes. + SmallVector<int, 4> WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) + scaleShuffleMask(2, RepeatedMask, WordMask); + + Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); + ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; +} + +// Attempt to match a combined unary shuffle mask against supported binary +// shuffle instructions. +// TODO: Investigate sharing more of this with shuffle lowering. 
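The getV4X86ShuffleImm calls above pack a four-element mask into the 8-bit immediate used by PSHUFD/PSHUFLW/PSHUFHW/VPERMILPS, two bits per destination lane; the PSHUFHW path first rebases indices 4..7 down to 0..3 (the OffsetHiMask step). A small sketch of that packing (helper name is illustrative):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <cstdint>

uint8_t packV4ShuffleImm(const int (&Mask)[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];   // undef lanes may pick anything
    assert(M < 4 && "each index must fit in two bits");
    Imm |= uint8_t(M) << (2 * i);        // two bits per destination lane
  }
  return Imm;
}

int main() {
  int Identity[4] = {0, 1, 2, 3};
  assert(packV4ShuffleImm(Identity) == 0xE4);   // 0b11'10'01'00

  // A PSHUFHW candidate selects only from elements 4..7; rebase to 0..3
  // before packing, mirroring the OffsetHiMask loop above.
  int HiMask[4] = {7, 6, 5, 4};
  int Rebased[4];
  for (int i = 0; i != 4; ++i)
    Rebased[i] = HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4;
  assert(packV4ShuffleImm(Rebased) == 0x1B);    // 0b00'01'10'11
}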
+static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, + unsigned &Shuffle, MVT &ShuffleVT) { + bool FloatDomain = SrcVT.isFloatingPoint(); + + if (SrcVT.is128BitVector()) { + if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { + Shuffle = X86ISD::MOVLHPS; + ShuffleVT = MVT::v4f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { + Shuffle = X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) { + Shuffle = X86ISD::UNPCKL; + ShuffleVT = MVT::v4f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) { + Shuffle = X86ISD::UNPCKH; + ShuffleVT = MVT::v4f32; + return true; + } + if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) || + isTargetShuffleEquivalent( + Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) { + Shuffle = X86ISD::UNPCKL; + ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; + return true; + } + if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) || + isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, + 13, 14, 14, 15, 15})) { + Shuffle = X86ISD::UNPCKH; + ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; + return true; + } + } + + return false; +} + /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// -/// This is the leaf of the recursive combinine below. When we have found some +/// This is the leaf of the recursive combine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. -static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, - int Depth, bool HasPSHUFB, SelectionDAG &DAG, +static bool combineX86ShuffleChain(SDValue Input, SDValue Root, + ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { - assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + const X86Subtarget &Subtarget) { + assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); // Find the operand that enters the chain. Note that multiple uses are OK // here, we're not going to remove the operand we find. - SDValue Input = Op.getOperand(0); - while (Input.getOpcode() == ISD::BITCAST) - Input = Input.getOperand(0); + Input = peekThroughBitcasts(Input); MVT VT = Input.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); SDLoc DL(Root); - if (Mask.size() == 1) { - int Index = Mask[0]; - assert((Index >= 0 || Index == SM_SentinelUndef || - Index == SM_SentinelZero) && - "Invalid shuffle index found!"); - - // We may end up with an accumulated mask of size 1 as a result of - // widening of shuffle operands (see function canWidenShuffleElements). - // If the only shuffle index is equal to SM_SentinelZero then propagate - // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle - // mask, and therefore the entire chain of shuffles can be folded away. 
- if (Index == SM_SentinelZero) - DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); - else - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), - /*AddTo*/ true); + SDValue Res; + + unsigned NumBaseMaskElts = BaseMask.size(); + if (NumBaseMaskElts == 1) { + assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + /*AddTo*/ true); return true; } - // Use the float domain if the operand type is a floating point type. - bool FloatDomain = VT.isFloatingPoint(); + unsigned RootSizeInBits = RootVT.getSizeInBits(); + unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; - // For floating point shuffles, we don't have free copies in the shuffle - // instructions or the ability to load as part of the instruction, so - // canonicalize their shuffles to UNPCK or MOV variants. - // - // Note that even with AVX we prefer the PSHUFD form of shuffle for integer - // vectors because it can have a load folded into it that UNPCK cannot. This - // doesn't preclude something switching to the shorter encoding post-RA. - // - // FIXME: Should teach these routines about AVX vector widths. - if (FloatDomain && VT.is128BitVector()) { - if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { - bool Lo = Mask.equals({0, 0}); - unsigned Shuffle; - MVT ShuffleVT; - // Check if we have SSE3 which will let us use MOVDDUP. That instruction - // is no slower than UNPCKLPD but has the option to fold the input operand - // into even an unaligned memory load. - if (Lo && Subtarget->hasSSE3()) { - Shuffle = X86ISD::MOVDDUP; - ShuffleVT = MVT::v2f64; - } else { - // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller - // than the UNPCK variants. - Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; - ShuffleVT = MVT::v4f32; - } - if (Depth == 1 && Root->getOpcode() == Shuffle) - return false; // Nothing to do! - Op = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); - if (Shuffle == X86ISD::MOVDDUP) - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); - else - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), - /*AddTo*/ true); - return true; - } - if (Subtarget->hasSSE3() && - (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { - bool Lo = Mask.equals({0, 0, 2, 2}); - unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; - MVT ShuffleVT = MVT::v4f32; - if (Depth == 1 && Root->getOpcode() == Shuffle) - return false; // Nothing to do! - Op = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + // Don't combine if we are a AVX512/EVEX target and the mask element size + // is different from the root element size - this would prevent writemasks + // from being reused. + // TODO - this currently prevents all lane shuffles from occurring. + // TODO - check for writemasks usage instead of always preventing combining. + // TODO - attempt to narrow Mask back to writemask size. + if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits && + (RootSizeInBits == 512 || + (Subtarget.hasVLX() && RootSizeInBits >= 128))) { + return false; + } + + // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. + + // Handle 128-bit lane shuffles of 256-bit vectors. 
+ if (VT.is256BitVector() && NumBaseMaskElts == 2 && + !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) + return false; // Nothing to do! + MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64 + : MVT::v4i64); + unsigned PermMask = 0; + PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); + PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); + + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, + DAG.getUNDEF(ShuffleVT), + DAG.getConstant(PermMask, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + // For masks that have been widened to 128-bit elements or more, + // narrow back down to 64-bit elements. + SmallVector<int, 64> Mask; + if (BaseMaskEltSizeInBits > 64) { + assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); + int MaskScale = BaseMaskEltSizeInBits / 64; + scaleShuffleMask(MaskScale, BaseMask, Mask); + } else { + Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); + } + + unsigned NumMaskElts = Mask.size(); + unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; + + // Determine the effective mask value type. + bool FloatDomain = + (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) && + (32 <= MaskEltSizeInBits); + MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) + : MVT::getIntegerVT(MaskEltSizeInBits); + MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); + + // Attempt to match the mask against known shuffle patterns. + MVT ShuffleVT; + unsigned Shuffle, PermuteImm; + + if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, + DAG.getConstant(PermuteImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + // Attempt to blend with zero. + if (NumMaskElts <= 8 && + ((Subtarget.hasSSE41() && VT.is128BitVector()) || + (Subtarget.hasAVX() && VT.is256BitVector()))) { + // Convert VT to a type compatible with X86ISD::BLENDI. + // TODO - add 16i16 support (requires lane duplication). 
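The PermMask built above uses one nibble of the VPERM2X128 immediate per destination 128-bit lane: the low bits of the nibble select a source lane and bit 3 (0x8) forces the lane to zero, which is how negative (zeroable) mask elements are encoded. A small model of that encoding, assuming the usual VPERM2F128/VPERM2I128 semantics (names are illustrative):

// Illustrative sketch (not part of the patch).
#include <array>
#include <cassert>
#include <cstdint>

using Lane = std::array<uint64_t, 2>;     // one 128-bit lane
using Vec256 = std::array<Lane, 2>;       // two lanes

Vec256 vperm2x128(const Vec256 &Lo, const Vec256 &Hi, uint8_t Imm) {
  std::array<Lane, 4> Pool = {Lo[0], Lo[1], Hi[0], Hi[1]};
  Vec256 Out{};
  for (int l = 0; l != 2; ++l) {
    uint8_t Sel = (Imm >> (4 * l)) & 0xF; // nibble for this destination lane
    Out[l] = (Sel & 0x8) ? Lane{0, 0} : Pool[Sel & 0x3];
  }
  return Out;
}

int main() {
  Vec256 V = {Lane{1, 2}, Lane{3, 4}};
  // Lane mask {1, SM_SentinelZero}: take the input's high lane, zero the rest.
  uint8_t Imm = (1 << 0) | (0x8 << 4);
  Vec256 R = vperm2x128(V, V, Imm);
  assert((R[0] == Lane{3, 4}) && (R[1] == Lane{0, 0}));
}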
+ MVT ShuffleVT = MaskVT; + if (Subtarget.hasAVX2()) { + if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v8i32; + else if (ShuffleVT == MVT::v2i64) + ShuffleVT = MVT::v4i32; + } else { + if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) + ShuffleVT = MVT::v8i16; + else if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v4f64; + else if (ShuffleVT == MVT::v8i32) + ShuffleVT = MVT::v8f32; + } + + if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, + /*Low*/ 0) && + NumMaskElts <= ShuffleVT.getVectorNumElements()) { + unsigned BlendMask = 0; + unsigned ShuffleSize = ShuffleVT.getVectorNumElements(); + unsigned MaskRatio = ShuffleSize / NumMaskElts; + + if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI) + return false; + + for (unsigned i = 0; i != ShuffleSize; ++i) + if (Mask[i / MaskRatio] < 0) + BlendMask |= 1u << i; + + SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL); + Res = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero, + DAG.getConstant(BlendMask, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } - if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { - bool Lo = Mask.equals({0, 0, 1, 1}); - unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; - MVT ShuffleVT = MVT::v4f32; - if (Depth == 1 && Root->getOpcode() == Shuffle) + } + + // Attempt to combine to INSERTPS. + if (Subtarget.hasSSE41() && NumMaskElts == 4 && + (VT == MVT::v2f64 || VT == MVT::v4f32)) { + SmallBitVector Zeroable(4, false); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] < 0) + Zeroable[i] = true; + + unsigned InsertPSMask; + SDValue V1 = Input, V2 = Input; + if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, + Zeroable, Mask, DAG)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS) return false; // Nothing to do! - Op = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + V1 = DAG.getBitcast(MVT::v4f32, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(MVT::v4f32, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } } - // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK - // variants as none of these have single-instruction variants that are - // superior to the UNPCK formulation. - if (!FloatDomain && VT.is128BitVector() && - (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || - Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || - Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || - Mask.equals( - {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { - bool Lo = Mask[0] == 0; - unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; - if (Depth == 1 && Root->getOpcode() == Shuffle) - return false; // Nothing to do! 
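The blend-with-zero attempt that follows works because each bit of the BLENDI immediate selects the second operand for its lane: setting the bits of all zeroable mask elements and blending against a zero vector realizes "identity shuffle plus zeroing" in one instruction. A scalar sketch of that mask construction (illustrative helper, not LLVM code):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <vector>

std::vector<int> blendi(const std::vector<int> &A, const std::vector<int> &B,
                        unsigned Imm) {
  std::vector<int> Out(A.size());
  for (unsigned i = 0; i < A.size(); ++i)
    Out[i] = (Imm & (1u << i)) ? B[i] : A[i];   // bit set -> take operand B
  return Out;
}

int main() {
  // Combined mask {0, -1, 2, -1}: keep lanes 0 and 2 in place, zero 1 and 3.
  std::vector<int> Input = {10, 11, 12, 13};
  std::vector<int> Zero(4, 0);
  int Mask[4] = {0, -1, 2, -1};
  unsigned BlendMask = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (Mask[i] < 0)
      BlendMask |= 1u << i;                     // zeroable lanes come from Zero
  std::vector<int> Res = blendi(Input, Zero, BlendMask);
  assert((Res == std::vector<int>{10, 0, 12, 0}));
}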
- MVT ShuffleVT; - switch (Mask.size()) { - case 8: - ShuffleVT = MVT::v8i16; - break; - case 16: - ShuffleVT = MVT::v16i8; - break; - default: - llvm_unreachable("Impossible mask size!"); - }; - Op = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Op.getNode()); - Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), - /*AddTo*/ true); - return true; - } - // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return false; - // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we - // can replace them with a single PSHUFB instruction profitably. Intel's - // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but - // in practice PSHUFB tends to be *very* fast so we're more aggressive. - if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) + return false; + + bool MaskContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + + // If we have a single input shuffle with different shuffle patterns in the + // the 128-bit lanes use the variable mask to VPERMILPS. + // TODO Combine other mask types at higher depths. + if (HasVariableMask && !MaskContainsZeros && + ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || + (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { + SmallVector<SDValue, 16> VPermIdx; + for (int M : Mask) { + SDValue Idx = + M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); + VPermIdx.push_back(Idx); + } + MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); + SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + // If we have 3 or more shuffle instructions or a chain involving a variable + // mask, we can replace them with a single PSHUFB instruction profitably. + // Intel's manuals suggest only using PSHUFB if doing so replacing 5 + // instructions, but in practice PSHUFB tends to be *very* fast so we're + // more aggressive. + if ((Depth >= 3 || HasVariableMask) && + ((VT.is128BitVector() && Subtarget.hasSSSE3()) || + (VT.is256BitVector() && Subtarget.hasAVX2()) || + (VT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector<SDValue, 16> PSHUFBMask; int NumBytes = VT.getSizeInBits() / 8; - int Ratio = NumBytes / Mask.size(); + int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { - if (Mask[i / Ratio] == SM_SentinelUndef) { + int M = Mask[i / Ratio]; + if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } - int M = Mask[i / Ratio] != SM_SentinelZero - ? 
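The variable-mask combine below reduces every mask element modulo 4 because VPERMILPS with a vector index operand permutes each 128-bit lane independently, so only the low two bits of each selector matter. A rough behavioral model (names are illustrative, not LLVM or intrinsic APIs):

// Illustrative sketch (not part of the patch).
#include <array>
#include <cassert>

using V8 = std::array<float, 8>;                  // a 256-bit vector of floats

V8 vpermilps_var(const V8 &Src, const std::array<int, 8> &Idx) {
  V8 Out{};
  for (int i = 0; i != 8; ++i) {
    int Lane = i / 4;                             // selection stays in-lane
    Out[i] = Src[Lane * 4 + (Idx[i] & 3)];        // only the low two bits matter
  }
  return Out;
}

int main() {
  V8 V = {0, 1, 2, 3, 4, 5, 6, 7};
  // A mask with a different pattern per lane (reverse low lane, keep high lane)
  // still works because every entry selects within its own lane modulo 4.
  V8 R = vpermilps_var(V, {3, 2, 1, 0, 4, 5, 6, 7});
  assert((R == V8{3, 2, 1, 0, 4, 5, 6, 7}));
}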
Ratio * Mask[i / Ratio] + i % Ratio - : 255; + if (M == SM_SentinelZero) { + PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); + continue; + } + M = Ratio * M + i % Ratio; + assert ((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); - Op = DAG.getBitcast(ByteVT, Input); - DCI.AddToWorklist(Op.getNode()); - SDValue PSHUFBMaskOp = - DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); + Res = DAG.getBitcast(ByteVT, Input); + DCI.AddToWorklist(Res.getNode()); + SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); - Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); - DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } @@ -23288,10 +25333,10 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, /// combining in this recursive walk. static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, ArrayRef<int> RootMask, - int Depth, bool HasPSHUFB, + int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) @@ -23310,13 +25355,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); - if (!isTargetShuffle(Op.getOpcode())) - return false; + // Extract target shuffle mask and resolve sentinels and inputs. + SDValue Input0, Input1; SmallVector<int, 16> OpMask; - bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); - // We only can combine unary shuffles which we can decode the mask for. - if (!HaveMask || !IsUnary) + if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) return false; assert(VT.getVectorNumElements() == OpMask.size() && @@ -23327,6 +25369,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); + int MaskWidth = std::max<int>(OpMask.size(), RootMask.size()); int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); assert(((RootRatio == 1 && OpRatio == 1) || @@ -23334,13 +25377,13 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, "Must not have a ratio for both incoming and op masks!"); SmallVector<int, 16> Mask; - Mask.reserve(std::max(OpMask.size(), RootMask.size())); + Mask.reserve(MaskWidth); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. 
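The loop above lowers an element-level mask to the byte-level selector vector PSHUFB consumes: each element index is scaled by the bytes-per-element ratio and offset by the byte position, and zeroable elements become 255 because PSHUFB writes zero whenever the selector byte has its top bit set. A scalar sketch (illustrative helper, not LLVM code):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint8_t> pshufb(const std::vector<uint8_t> &Src,
                            const std::vector<uint8_t> &Sel) {
  std::vector<uint8_t> Out(Src.size());
  for (unsigned i = 0; i < Src.size(); ++i)
    Out[i] = (Sel[i] & 0x80) ? 0 : Src[Sel[i] % Src.size()];
  return Out;
}

int main() {
  // A v4i32-style mask {2, zero, 0, 1} lowered to a 16-byte selector.
  const int Mask[4] = {2, -2 /*zero*/, 0, 1};
  const int NumBytes = 16, Ratio = NumBytes / 4;  // 4 bytes per element
  std::vector<uint8_t> SelBytes;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    SelBytes.push_back(M < 0 ? 255 : uint8_t(Ratio * M + i % Ratio));
  }
  std::vector<uint8_t> Src(16);
  for (int i = 0; i < 16; ++i) Src[i] = uint8_t(100 + i);
  std::vector<uint8_t> Res = pshufb(Src, SelBytes);
  assert(Res[0] == 108 && Res[4] == 0 && Res[8] == 100 && Res[12] == 104);
}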
- for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + for (int i = 0; i < MaskWidth; ++i) { int RootIdx = i / RootRatio; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. @@ -23362,45 +25405,56 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, RootMaskedIdx % OpRatio); } - // See if we can recurse into the operand to combine more things. - switch (Op.getOpcode()) { - case X86ISD::PSHUFB: - HasPSHUFB = true; - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - if (Op.getOperand(0).hasOneUse() && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + // Handle the all undef/zero cases early. + if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { + DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); + return true; + } + if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) { + // TODO - should we handle the mixed zero/undef case as well? Just returning + // a zero mask will lose information on undef elements possibly reducing + // future combine possibilities. + DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(), + Subtarget, DAG, SDLoc(Root))); + return true; + } - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - assert(Op.getOperand(0) == Op.getOperand(1) && - "We only combine unary shuffles!"); - // We can't check for single use, we have to check that this shuffle is the - // only user. - if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + int MaskSize = Mask.size(); + bool UseInput0 = std::any_of(Mask.begin(), Mask.end(), + [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; }); + bool UseInput1 = std::any_of(Mask.begin(), Mask.end(), + [MaskSize](int Idx) { return MaskSize <= Idx; }); + + // At the moment we can only combine unary shuffle mask cases. + if (UseInput0 && UseInput1) + return false; + else if (UseInput1) { + std::swap(Input0, Input1); + ShuffleVectorSDNode::commuteMask(Mask); } + assert(Input0 && "Shuffle with no inputs detected"); + + HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); + + // See if we can recurse into Input0 (if it's a target shuffle). + if (Op->isOnlyUserOf(Input0.getNode()) && + combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, + HasVariableMask, DAG, DCI, Subtarget)) + return true; + // Minor canonicalization of the accumulated shuffle mask to make it easier - // to match below. All this does is detect masks with squential pairs of + // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector<int, 16> WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); - WidenedMask.clear(); } - return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, - Subtarget); + return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG, + DCI, Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. 
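The merge loop above composes the shuffle being inspected with the mask accumulated so far: each output lane indexes through the root mask first and then through the op mask, with sentinel values (< 0) propagating unchanged. A simplified sketch of the equal-width case only (the real loop also rescales when the two masks have different element counts):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <vector>

std::vector<int> composeMasks(const std::vector<int> &RootMask,
                              const std::vector<int> &OpMask) {
  std::vector<int> Combined(RootMask.size());
  for (unsigned i = 0; i < RootMask.size(); ++i) {
    int RootIdx = RootMask[i];
    Combined[i] = RootIdx < 0 ? RootIdx : OpMask[RootIdx];
  }
  return Combined;
}

int main() {
  std::vector<int> OpMask = {3, 2, 1, 0};     // inner shuffle (applied first)
  std::vector<int> RootMask = {0, 0, -1, 2};  // outer shuffle (applied last)
  std::vector<int> Combined = composeMasks(RootMask, OpMask);
  assert((Combined == std::vector<int>{3, 3, -1, 1}));
  // Applying Combined to {10, 20, 30, 40} in one step gives {40, 40, undef, 20},
  // the same as shuffling by OpMask and then by RootMask.
}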
@@ -23410,8 +25464,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; + SmallVector<SDValue, 2> Ops; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); + bool HaveMask = + getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -23647,9 +25703,9 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, } /// \brief Try to combine x86 target specific shuffles. -static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; @@ -23681,8 +25737,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, auto Op0 = N.getOperand(0); auto Op1 = N.getOperand(1); - if (Op0.getOpcode() == ISD::UNDEF && - Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); @@ -23719,6 +25774,129 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); } + // Attempt to merge blend(insertps(x,y),zero). + if (V0.getOpcode() == X86ISD::INSERTPS || + V1.getOpcode() == X86ISD::INSERTPS) { + assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); + + // Determine which elements are known to be zero. + SmallVector<int, 8> TargetMask; + SmallVector<SDValue, 2> BlendOps; + if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps)) + return SDValue(); + + // Helper function to take inner insertps node and attempt to + // merge the blend with zero into its zero mask. + auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) { + if (V.getOpcode() != X86ISD::INSERTPS) + return SDValue(); + SDValue Op0 = V.getOperand(0); + SDValue Op1 = V.getOperand(1); + SDValue Op2 = V.getOperand(2); + unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); + + // Check each element of the blend node's target mask - must either + // be zeroable (and update the zero mask) or selects the element from + // the inner insertps node. + for (int i = 0; i != 4; ++i) + if (TargetMask[i] < 0) + InsertPSMask |= (1u << i); + else if (TargetMask[i] != (i + Offset)) + return SDValue(); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + }; + + if (SDValue V = MergeInsertPSAndBlend(V0, 0)) + return V; + if (SDValue V = MergeInsertPSAndBlend(V1, 4)) + return V; + } + return SDValue(); + } + case X86ISD::INSERTPS: { + assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); + SDValue Op0 = N.getOperand(0); + SDValue Op1 = N.getOperand(1); + SDValue Op2 = N.getOperand(2); + unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; + unsigned DstIdx = (InsertPSMask >> 4) & 0x3; + unsigned ZeroMask = InsertPSMask & 0xF; + + // If we zero out all elements from Op0 then we don't need to reference it. 
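The INSERTPS combines that follow repeatedly decode and re-encode the instruction's immediate: bits [7:6] pick the element of the second source, bits [5:4] pick the destination lane, and bits [3:0] zero destination lanes. A small behavioral sketch of that layout (names are illustrative):

// Illustrative sketch (not part of the patch).
#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<float, 4>;

V4 insertps(V4 Dst, const V4 &Src, uint8_t Imm) {
  unsigned SrcIdx   = (Imm >> 6) & 0x3;
  unsigned DstIdx   = (Imm >> 4) & 0x3;
  unsigned ZeroMask = Imm & 0xF;
  Dst[DstIdx] = Src[SrcIdx];            // insert the selected element
  for (unsigned i = 0; i != 4; ++i)
    if (ZeroMask & (1u << i))
      Dst[i] = 0.0f;                    // then apply the zeroing mask
  return Dst;
}

int main() {
  V4 A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  // Take B[2], place it in lane 1 of A, and zero lane 3: imm = 0b10'01'1000.
  V4 R = insertps(A, B, 0x98);
  assert((R == V4{1, 7, 3, 0}));
  // If the zero mask covered every lane except DstIdx, the first operand would
  // be irrelevant -- the case the combine rewrites to use an UNDEF input.
}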
+ if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + + // If we zero out the element from Op1 then we don't need to reference it. + if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + + // Attempt to merge insertps Op1 with an inner target shuffle node. + SmallVector<int, 8> TargetMask1; + SmallVector<SDValue, 2> Ops1; + if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { + int M = TargetMask1[SrcIdx]; + if (isUndefOrZero(M)) { + // Zero/UNDEF insertion - zero out element and remove dependency. + InsertPSMask |= (1u << DstIdx); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + // Update insertps mask srcidx and reference the source input directly. + assert(0 <= M && M < 8 && "Shuffle index out of range"); + InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); + Op1 = Ops1[M < 4 ? 0 : 1]; + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + + // Attempt to merge insertps Op0 with an inner target shuffle node. + SmallVector<int, 8> TargetMask0; + SmallVector<SDValue, 2> Ops0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) + return SDValue(); + + bool Updated = false; + bool UseInput00 = false; + bool UseInput01 = false; + for (int i = 0; i != 4; ++i) { + int M = TargetMask0[i]; + if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { + // No change if element is already zero or the inserted element. + continue; + } else if (isUndefOrZero(M)) { + // If the target mask is undef/zero then we must zero the element. + InsertPSMask |= (1u << i); + Updated = true; + continue; + } + + // The input vector element must be inline. + if (M != i && M != (i + 4)) + return SDValue(); + + // Determine which inputs of the target shuffle we're using. + UseInput00 |= (0 <= M && M < 4); + UseInput01 |= (4 <= M); + } + + // If we're not using both inputs of the target shuffle then use the + // referenced input directly. + if (UseInput00 && !UseInput01) { + Updated = true; + Op0 = Ops0[0]; + } else if (!UseInput00 && UseInput01) { + Updated = true; + Op0 = Ops0[1]; + } + + if (Updated) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + return SDValue(); } default: @@ -23814,12 +25992,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, +static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); - if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); // We only handle target-independent shuffles. 
@@ -23865,13 +26043,10 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } -/// PerformShuffleCombine - Performs several different shuffle combines. -static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc dl(N); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. @@ -23886,9 +26061,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) - return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + return combineShuffle256(N, DAG, DCI, Subtarget); // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. @@ -23903,8 +26078,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && - N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && - N0.getOpcode() == ISD::BITCAST) { + N->getOpcode() == ISD::VECTOR_SHUFFLE && + N->getOperand(0).getOpcode() == ISD::BITCAST && + N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); @@ -23936,7 +26115,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); - return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); } } } @@ -23952,9 +26131,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return LD; if (isTargetShuffle(N->getOpcode())) { - SDValue Shuffle = - PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); - if (Shuffle.getNode()) + if (SDValue Shuffle = + combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle @@ -23973,8 +26151,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target -/// specific shuffle of a load can be folded into a single element load. +/// Check if a vector extract from a target-specific shuffle of a load can be +/// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. 
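The ADDSUB node formed by this combine subtracts in the even lanes and adds in the odd lanes (the ADDSUBPS/ADDSUBPD behavior), which is exactly what a shuffle selecting between a full FSUB and a full FADD produces. A scalar sketch of that equivalence (illustrative names):

// Illustrative sketch (not part of the patch).
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

V4 addsub(const V4 &A, const V4 &B) {
  V4 R{};
  for (int i = 0; i != 4; ++i)
    R[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  return R;
}

int main() {
  V4 A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  V4 Sub = {A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3]};
  V4 Add = {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
  // Selecting Sub in even lanes and Add in odd lanes is the shuffle shape the
  // combine looks for; a single ADDSUB replaces all three nodes.
  V4 Blended = {Sub[0], Add[1], Sub[2], Add[3]};
  assert(Blended == addsub(A, B));
}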
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, @@ -24012,9 +26190,10 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, return SDValue(); SmallVector<int, 16> ShuffleMask; + SmallVector<SDValue, 2> ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, - ShuffleMask, UnaryShuffle)) + ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -24029,12 +26208,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) - : InVec.getOperand(1); + SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] + : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = InVec.getNumOperands() > 1 && - InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + unsigned AllowedUses = + (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. @@ -24068,18 +26247,16 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) - : InVec.getOperand(1); - Shuffle = DAG.getVectorShuffle(CurrentVT, dl, - InVec.getOperand(0), Shuffle, - &ShuffleMask[0]); + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, + ShuffleMask); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } -static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -24108,8 +26285,8 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } - if (((Subtarget->hasSSE1() && VT == MVT::f32) || - (Subtarget->hasSSE2() && VT == MVT::f64)) && + if (((Subtarget.hasSSE1() && VT == MVT::f32) || + (Subtarget.hasSSE2() && VT == MVT::f64)) && isa<ConstantSDNode>(N0.getOperand(1)) && N0.getOperand(0).getOpcode() == ISD::BITCAST && N0.getOperand(0).getOperand(0).getValueType() == VT) { @@ -24121,13 +26298,12 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index -/// generation and convert it from being a bunch of shuffles and extracts -/// into a somewhat faster sequence. For i686, the best sequence is apparently -/// storing the value and loading scalars back, while for x64 we should -/// use 64-bit extracts and shifts. -static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +/// Detect vector gather/scatter index generation and convert it from being a +/// bunch of shuffles and extracts into a somewhat faster sequence. 
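The bitcast combine below folds integer AND/OR/XOR on a bitcasted f32/f64 back into the FP-domain FAND/FOR/FXOR nodes; the typical payload is sign-bit manipulation, and keeping it in an XMM register avoids a round trip through a GPR. A minimal sketch of the kind of source pattern involved (standard C++, not the DAG combine itself):

// Illustrative sketch (not part of the patch).
#include <cassert>
#include <cstdint>
#include <cstring>

float bitwise_fabs(float x) {
  uint32_t Bits;
  std::memcpy(&Bits, &x, sizeof(Bits));   // bitcast f32 -> i32
  Bits &= 0x7fffffffu;                    // AND away the sign bit
  std::memcpy(&x, &Bits, sizeof(Bits));   // bitcast i32 -> f32
  return x;                               // ideal lowering: a single ANDPS
}

int main() {
  assert(bitwise_fabs(-3.5f) == 3.5f);
  assert(bitwise_fabs(3.5f) == 3.5f);
}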
+/// For i686, the best sequence is apparently storing the value and loading +/// scalars back, while for x64 we should use 64-bit extracts and shifts. +static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; @@ -24136,25 +26312,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32 && - InputVector.getValueType() == MVT::v2i32) { + InputVector.getValueType() == MVT::v2i32 && + isa<ConstantSDNode>(N->getOperand(1)) && + N->getConstantOperandVal(1) == 0) { + SDValue MMXSrc = InputVector.getNode()->getOperand(0); // The bitcast source is a direct mmx result. - SDValue MMXSrc = InputVector.getNode()->getOperand(0); if (MMXSrc.getValueType() == MVT::x86mmx) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), - InputVector.getNode()->getOperand(0)); - - // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). - if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && - MMXSrc.getValueType() == MVT::i64) { - SDValue MMXSrcOp = MMXSrc.getOperand(0); - if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && - MMXSrcOp.getValueType() == MVT::v1i64 && - MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), MMXSrcOp.getOperand(0)); - } + return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } EVT VT = N->getValueType(0); @@ -24236,7 +26401,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo()); EVT ElementType = InputVector.getValueType().getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; @@ -24251,10 +26416,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. - Vals[i] = DAG.getLoad(ElementType, dl, Ch, - ScalarAddr, MachinePointerInfo(), - false, false, false, 0); - + Vals[i] = + DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo()); } } @@ -24272,55 +26435,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue -transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc dl(N); - SDValue Cond = N->getOperand(0); - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - - if (Cond.getOpcode() == ISD::SIGN_EXTEND) { - SDValue CondSrc = Cond->getOperand(0); - if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) - Cond = CondSrc->getOperand(0); - } - - if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) - return SDValue(); - - // A vselect where all conditions and data are constants can be optimized into - // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
- if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && - ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) - return SDValue(); - - unsigned MaskValue = 0; - if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) - return SDValue(); - - MVT VT = N->getSimpleValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> ShuffleMask(NumElems, -1); - for (unsigned i = 0; i < NumElems; ++i) { - // Be sure we emit undef where we can. - if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) - ShuffleMask[i] = -1; - else - ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); - } - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) - return SDValue(); - return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); -} - -/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT -/// nodes. -static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// Do target-specific dag combines on SELECT and VSELECT nodes. +static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -24337,8 +26455,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && - (Subtarget->hasSSE2() || - (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { + (Subtarget.hasSSE2() || + (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); unsigned Opcode = 0; @@ -24476,7 +26594,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } EVT CondVT = Cond.getValueType(); - if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && + if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to @@ -24487,7 +26605,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16) && - !(Subtarget->hasBWI() && Subtarget->hasVLX())) { + !(Subtarget.hasBWI() && Subtarget.hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); @@ -24625,8 +26743,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. - ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || - (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. 
If it's on the @@ -24730,25 +26848,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // We should generate an X86ISD::BLENDI from a vselect if its argument - // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of - // constants. This specific pattern gets generated when we split a - // selector for a 512 bit vector in a machine without AVX512 (but with - // 256-bit vectors), during legalization: - // - // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) - // - // Iff we find this pattern and the build_vectors are built from - // constants, we translate the vselect into a shuffle_vector that we - // know will be matched by LowerVECTOR_SHUFFLEtoBlend. - if ((N->getOpcode() == ISD::VSELECT || - N->getOpcode() == X86ISD::SHRUNKBLEND) && - !DCI.isBeforeLegalize() && !VT.is512BitVector()) { - SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); - if (Shuffle.getNode()) - return Shuffle; - } - // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the // condition so that the blends can use the high bit of each element and use @@ -24780,10 +26879,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. - if (VT.is128BitVector() && !Subtarget->hasSSE41()) + if (VT.is128BitVector() && !Subtarget.hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 - if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) + if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); @@ -24837,6 +26936,73 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Combine: +/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) +/// to: +/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) +/// i.e., reusing the EFLAGS produced by the LOCKed instruction. +/// Note that this is only legal for some op/cc combinations. +static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, + SelectionDAG &DAG) { + // This combine only operates on CMP-like nodes. + if (!(Cmp.getOpcode() == X86ISD::CMP || + (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) + return SDValue(); + + // This only applies to variations of the common case: + // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) + // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) + // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) + // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) + // Using the proper condcodes (see below), overflow is checked for. + + // FIXME: We can generalize both constraints: + // - XOR/OR/AND (if they were made to survive AtomicExpand) + // - LHS != 1 + // if the result is compared. 
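The icmp rewrites listed above lean on a simple arithmetic fact: for the value x returned by the atomic RMW, x < 0 is the same question as x + 1 <= 0, and x <= 0 the same as x - 1 < 0, provided the +/-1 is evaluated without wraparound; the OF flag produced by the LOCKed ADD/SUB is what makes the remapped condition codes overflow-aware. A small C++ check of the identity, widening to 64 bits to stand in for the flag semantics (illustration only, not part of the combine):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    int main() {
      const int32_t samples[] = {std::numeric_limits<int32_t>::min(), -2, -1, 0, 1,
                                 std::numeric_limits<int32_t>::max()};
      for (int32_t x : samples) {
        // COND_S on (cmp x, 0)  <=>  COND_LE on the flags of (x + 1).
        assert((x < 0) == (static_cast<int64_t>(x) + 1 <= 0));
        // COND_LE on (cmp x, 0) <=>  COND_L on the flags of (x - 1).
        assert((x <= 0) == (static_cast<int64_t>(x) - 1 < 0));
      }
      return 0;
    }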
+ + SDValue CmpLHS = Cmp.getOperand(0); + SDValue CmpRHS = Cmp.getOperand(1); + + if (!CmpLHS.hasOneUse()) + return SDValue(); + + auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); + if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) + return SDValue(); + + const unsigned Opc = CmpLHS.getOpcode(); + + if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) + return SDValue(); + + SDValue OpRHS = CmpLHS.getOperand(2); + auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS); + if (!OpRHSC) + return SDValue(); + + APInt Addend = OpRHSC->getAPIntValue(); + if (Opc == ISD::ATOMIC_LOAD_SUB) + Addend = -Addend; + + if (CC == X86::COND_S && Addend == 1) + CC = X86::COND_LE; + else if (CC == X86::COND_NS && Addend == 1) + CC = X86::COND_G; + else if (CC == X86::COND_G && Addend == -1) + CC = X86::COND_GE; + else if (CC == X86::COND_LE && Addend == -1) + CC = X86::COND_L; + else + return SDValue(); + + SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); + DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), + DAG.getUNDEF(CmpLHS.getValueType())); + DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); + return LockOp; +} + // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. @@ -24853,10 +27019,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { - // Quit if not CMP and SUB with its value result used. - if (Cmp.getOpcode() != X86ISD::CMP && - (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) - return SDValue(); + // This combine only operates on CMP-like nodes. + if (!(Cmp.getOpcode() == X86ISD::CMP || + (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) + return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) @@ -24890,6 +27056,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || + SetCC.getOpcode() == ISD::AssertZext || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; @@ -24897,7 +27064,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; - if (OpIdx == -1) + if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; @@ -25008,10 +27175,20 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, return true; } +/// Optimize an EFLAGS definition used according to the condition code \p CC +/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing +/// uses of chain values. +static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, + SelectionDAG &DAG) { + if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) + return R; + return combineSetCCAtomicArith(EFLAGS, CC, DAG); +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] -static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); // If the flag operand isn't dead, don't touch this CMOV. 
@@ -25034,15 +27211,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } - SDValue Flags; - - Flags = checkBoolTestSetCCCombine(Cond, CC); - if (Flags.getNode() && - // Extra check as FCMOV only supports a subset of X86 cond. - (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { - SDValue Ops[] = { FalseOp, TrueOp, - DAG.getConstant(CC, DL, MVT::i8), Flags }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + // Try to simplify the EFLAGS and condition code operands. + // We can't always do this as FCMOV only supports a subset of X86 cond. + if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) { + if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { + SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), + Flags}; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + } } // If this is a select between two integer constants, try to do some @@ -25218,11 +27394,216 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformMulCombine - Optimize a single multiply with constant into two -/// in order to implement it with two cheaper instructions, e.g. -/// LEA + SHL, LEA + LEA. -static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +/// Different mul shrinking modes. +enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; + +static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { + EVT VT = N->getOperand(0).getValueType(); + if (VT.getScalarSizeInBits() != 32) + return false; + + assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); + unsigned SignBits[2] = {1, 1}; + bool IsPositive[2] = {false, false}; + for (unsigned i = 0; i < 2; i++) { + SDValue Opd = N->getOperand(i); + + // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to + // compute signbits for it separately. + if (Opd.getOpcode() == ISD::ANY_EXTEND) { + // For anyextend, it is safe to assume an appropriate number of leading + // sign/zero bits. + if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) + SignBits[i] = 25; + else if (Opd.getOperand(0).getValueType().getVectorElementType() == + MVT::i16) + SignBits[i] = 17; + else + return false; + IsPositive[i] = true; + } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) { + // All the operands of BUILD_VECTOR need to be int constant. + // Find the smallest value range which all the operands belong to. + SignBits[i] = 32; + IsPositive[i] = true; + for (const SDValue &SubOp : Opd.getNode()->op_values()) { + if (SubOp.isUndef()) + continue; + auto *CN = dyn_cast<ConstantSDNode>(SubOp); + if (!CN) + return false; + APInt IntVal = CN->getAPIntValue(); + if (IntVal.isNegative()) + IsPositive[i] = false; + SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits()); + } + } else { + SignBits[i] = DAG.ComputeNumSignBits(Opd); + if (Opd.getOpcode() == ISD::ZERO_EXTEND) + IsPositive[i] = true; + } + } + + bool AllPositive = IsPositive[0] && IsPositive[1]; + unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); + // When ranges are from -128 ~ 127, use MULS8 mode. + if (MinSignBits >= 25) + Mode = MULS8; + // When ranges are from 0 ~ 255, use MULU8 mode. + else if (AllPositive && MinSignBits >= 24) + Mode = MULU8; + // When ranges are from -32768 ~ 32767, use MULS16 mode. + else if (MinSignBits >= 17) + Mode = MULS16; + // When ranges are from 0 ~ 65535, use MULU16 mode. 
+ else if (AllPositive && MinSignBits >= 16) + Mode = MULU16; + else + return false; + return true; +} + +/// When the operands of vector mul are extended from smaller size values, +/// like i8 and i16, the type of mul may be shrinked to generate more +/// efficient code. Two typical patterns are handled: +/// Pattern1: +/// %2 = sext/zext <N x i8> %1 to <N x i32> +/// %4 = sext/zext <N x i8> %3 to <N x i32> +// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) +/// %5 = mul <N x i32> %2, %4 +/// +/// Pattern2: +/// %2 = zext/sext <N x i16> %1 to <N x i32> +/// %4 = zext/sext <N x i16> %3 to <N x i32> +/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) +/// %5 = mul <N x i32> %2, %4 +/// +/// There are four mul shrinking modes: +/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is +/// -128 to 128, and the scalar value range of %4 is also -128 to 128, +/// generate pmullw+sext32 for it (MULS8 mode). +/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is +/// 0 to 255, and the scalar value range of %4 is also 0 to 255, +/// generate pmullw+zext32 for it (MULU8 mode). +/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is +/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, +/// generate pmullw+pmulhw for it (MULS16 mode). +/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is +/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, +/// generate pmullw+pmulhuw for it (MULU16 mode). +static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // pmulld is supported since SSE41. It is better to use pmulld + // instead of pmullw+pmulhw. + if (Subtarget.hasSSE41()) + return SDValue(); + + ShrinkMode Mode; + if (!canReduceVMulWidth(N, DAG, Mode)) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getOperand(0).getValueType(); + unsigned RegSize = 128; + MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); + EVT ReducedVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); + // Shrink the operands of mul. + SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); + SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); + + if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) { + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) { + return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); + } else { + MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. + SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, + ReducedVT, NewN0, NewN1); + + // Repack the lower part and higher part result of mul into a wider + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); + for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements(); + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. 
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { + ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2; + ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); + } + } else { + // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want + // to legalize the mul explicitly because implicit legalization for type + // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack + // instructions which will not exist when we explicitly legalize it by + // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with + // <4 x i16> undef). + // + // Legalize the operands of mul. + SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(), + DAG.getUNDEF(ReducedVT)); + Ops[0] = NewN0; + NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + Ops[0] = NewN1; + NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + + if (Mode == MULU8 || Mode == MULS8) { + // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower + // part is needed. + SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + + // convert the type of mul result to VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG + : ISD::SIGN_EXTEND_VECTOR_INREG, + DL, ResVT, Mul); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } else { + // Generate the lower and higher part of mul: pmulhw/pmulhuw. For + // MULU16/MULS16, both parts are needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, + OpsVT, NewN0, NewN1); + + // Repack the lower part and higher part result of mul into a wider + // result. Make sure the type of mul result is VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi); + Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + } +} + +/// Optimize a single multiply with constant into two operations in order to +/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. +static SDValue combineMul(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (DCI.isBeforeLegalize() && VT.isVector()) + return reduceVMULWidth(N, DAG, Subtarget); + // An imul is usually smaller than the alternative sequence. 
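The MULS16/MULU16 paths in reduceVMULWidth above rebuild each 32-bit product from two 16-bit halves: pmullw supplies the low 16 bits, pmulhw/pmulhuw the high 16 bits, and the punpcklwd/punpckhwd shuffles interleave them back into i32 lanes. A scalar C++ sketch of that reassembly for one signed element (an illustration of the identity, not the vector code path):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int16_t tests[][2] = {{-32768, 32767}, {-1, -1}, {12345, -321}, {0, 7}};
      for (const auto &t : tests) {
        int32_t full = static_cast<int32_t>(t[0]) * static_cast<int32_t>(t[1]); // the original i32 mul
        uint32_t bits = static_cast<uint32_t>(full);
        uint16_t lo = static_cast<uint16_t>(bits);       // what pmullw keeps
        uint16_t hi = static_cast<uint16_t>(bits >> 16); // what pmulhw keeps
        // punpck*wd-style repack: interleaving lo and hi recovers the i32 lane.
        assert((static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16)) == bits);
      }
      return 0;
    }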
if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -25230,7 +27611,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); - EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); @@ -25307,7 +27687,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); @@ -25320,7 +27700,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); + const APInt &ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if @@ -25367,7 +27747,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); @@ -25424,11 +27804,11 @@ static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && - (!Subtarget->hasInt256() || + (!Subtarget.hasInt256() || (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); @@ -25436,7 +27816,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { - APInt ShiftAmt = AmtSplat->getAPIntValue(); + const APInt &ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getSimpleVT().getVectorElementType().getSizeInBits(); @@ -25451,16 +27831,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformShiftCombine - Combine shifts. -static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineShift(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (N->getOpcode() == ISD::SHL) - if (SDValue V = PerformSHLCombine(N, DAG)) + if (SDValue V = combineShiftLeft(N, DAG)) return V; if (N->getOpcode() == ISD::SRA) - if (SDValue V = PerformSRACombine(N, DAG)) + if (SDValue V = combineShiftRightAlgebraic(N, DAG)) return V; // Try to fold this logical shift into a zero vector. @@ -25471,17 +27850,17 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, return SDValue(); } -// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) -// where both setccs reference the same FP CMP, and rewrite for CMPEQSS -// and friends. Likewise for OR -> CMPNEQSS. 
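This combine relies on CMP{EQ|NE}SS producing an all-ones or all-zeros bit mask in the low element, whose low bit is then the boolean result of the compare; NaN operands fall out correctly because the compare is ordered. A scalar C++ illustration of that mask-and-1 trick, modelling just the ordered-equal case (cmpeqssMask is a hypothetical stand-in, not a DAG helper):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Model of CMPEQSS on one lane: all-ones mask when ordered-equal, all-zeros otherwise.
    static uint32_t cmpeqssMask(float a, float b) { return (a == b) ? 0xFFFFFFFFu : 0u; }

    int main() {
      const float qnan = std::numeric_limits<float>::quiet_NaN();
      const float vals[][2] = {{1.0f, 1.0f}, {1.0f, 2.0f}, {qnan, 1.0f}, {qnan, qnan}};
      for (const auto &v : vals) {
        bool expected = (v[0] == v[1]);                      // false for any NaN operand
        assert(((cmpeqssMask(v[0], v[1]) & 1u) != 0) == expected);
      }
      return 0;
    }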
-static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs +/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for +/// OR -> CMPNEQSS. +static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. - if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); @@ -25530,7 +27909,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; - if (Subtarget->hasAVX512()) { + if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); @@ -25547,7 +27926,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; - if (is64BitFP && !Subtarget->is64Bit()) { + if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the @@ -25574,34 +27953,47 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector -/// so it can be folded inside ANDNP. -static bool CanFoldXORWithAllOnes(const SDNode *N) { +/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). +static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::AND); + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); - // Match direct AllOnes for 128 and 256-bit vectors - if (ISD::isBuildVectorAllOnes(N)) - return true; + if (VT != MVT::v2i64 && VT != MVT::v4i64 && + VT != MVT::v8i64 && VT != MVT::v16i32 && + VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX + return SDValue(); - // Look through a bit convert. - if (N->getOpcode() == ISD::BITCAST) - N = N->getOperand(0).getNode(); - - // Sometimes the operand may come from a insert_subvector building a 256-bit - // allones vector - if (VT.is256BitVector() && - N->getOpcode() == ISD::INSERT_SUBVECTOR) { - SDValue V1 = N->getOperand(0); - SDValue V2 = N->getOperand(1); - - if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && - V1.getOperand(0).getOpcode() == ISD::UNDEF && - ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && - ISD::isBuildVectorAllOnes(V2.getNode())) - return true; - } + // Canonicalize XOR to the left. + if (N1.getOpcode() == ISD::XOR) + std::swap(N0, N1); - return false; + if (N0.getOpcode() != ISD::XOR) + return SDValue(); + + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + + N01 = peekThroughBitcasts(N01); + + // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an + // insert_subvector building a 256-bit AllOnes vector. 
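The fold implemented by combineANDXORWithAllOnesIntoANDNP above is just the ANDNP identity: when one AND operand is (xor X, all-ones), the whole expression is ~X & Y, which is what a single andnp/vpandn computes. A minimal lane-wise check in scalar C++ (assuming 64-bit lanes for illustration):

    #include <cassert>
    #include <cstdint>

    static uint64_t andnp(uint64_t x, uint64_t y) { return ~x & y; } // vpandn semantics

    int main() {
      const uint64_t x = 0x00FF00FF12345678ull, y = 0x0F0F0F0FDEADBEEFull;
      const uint64_t allOnes = ~0ull;
      assert(((x ^ allOnes) & y) == andnp(x, y)); // (and (xor X, -1), Y) == (andnp X, Y)
      return 0;
    }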
+ if (!ISD::isBuildVectorAllOnes(N01.getNode())) { + if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR) + return SDValue(); + + SDValue V1 = N01->getOperand(0); + SDValue V2 = N01->getOperand(1); + if (V1.getOpcode() != ISD::INSERT_SUBVECTOR || + !V1.getOperand(0).isUndef() || + !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) || + !ISD::isBuildVectorAllOnes(V2.getNode())) + return SDValue(); + } + return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized @@ -25610,7 +28002,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) { // some of the transition sequences. static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.is256BitVector()) return SDValue(); @@ -25660,8 +28052,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); - SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); - N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); + N1 = DAG.getSplatBuildVector(WideVT, DL, N1); } else if (RHSTrunc) { N1 = N1->getOperand(0); } @@ -25687,9 +28078,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, } } -static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, +static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); @@ -25705,8 +28096,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, // The other side of the AND should be a splat of 2^C, where C // is the number of bits in the source type. - if (N1.getOpcode() == ISD::BITCAST) - N1 = N1.getOperand(0); + N1 = peekThroughBitcasts(N1); if (N1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); @@ -25715,10 +28105,11 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, EVT SrcType = Shuffle->getValueType(0); // We expect a single-source shuffle - if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) + if (!Shuffle->getOperand(1)->isUndef()) return SDValue(); unsigned SrcSize = SrcType.getScalarSizeInBits(); + unsigned NumElems = SrcType.getVectorNumElements(); APInt SplatValue, SplatUndef; unsigned SplatBitSize; @@ -25742,7 +28133,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, // the source and dest type. unsigned ZextRatio = ResSize / SrcSize; bool IsZext = true; - for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { + for (unsigned i = 0; i != NumElems; ++i) { if (i % ZextRatio) { if (Shuffle->getMaskElt(i) > 0) { // Expected undef @@ -25765,8 +28156,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero // (instead of undef) where the k elements come from the zero vector. 
SmallVector<int, 8> Mask; - unsigned NumElems = SrcType.getVectorNumElements(); - for (unsigned i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) if (i % ZextRatio) Mask.push_back(NumElems); else @@ -25781,7 +28171,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { unsigned FPOpcode = ISD::DELETED_NODE; if (N->getOpcode() == ISD::AND) FPOpcode = X86ISD::FAND; @@ -25798,8 +28188,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); SDLoc DL(N); if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && - ((Subtarget->hasSSE1() && VT == MVT::i32) || - (Subtarget->hasSSE2() && VT == MVT::i64))) { + ((Subtarget.hasSSE1() && VT == MVT::i32) || + (Subtarget.hasSSE2() && VT == MVT::i64))) { SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); EVT N00Type = N00.getValueType(); @@ -25812,21 +28202,63 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is +/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to +/// eliminate loading the vector constant mask value. This relies on the fact +/// that a PCMP always creates an all-ones or all-zeros bitmask per element. +static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); + SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); + + // TODO: Use AssertSext to mark any nodes that have the property of producing + // all-ones or all-zeros. Then check for that node rather than particular + // opcodes. + if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT) + return SDValue(); + + // The existence of the PCMP node guarantees that we have the required SSE2 or + // AVX2 for a shift of this vector type, but there is no vector shift by + // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the + // masked compare nodes, so they should not make it here. 
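Because a PCMPEQ/PCMPGT lane is always all-ones or all-zeros, masking it with 1 is equivalent to logically shifting the sign bit down, which is the PSRL-by-(width-1) this combine emits instead of materializing the constant-1 vector. A scalar C++ check of that equivalence for one 32-bit lane (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t lanes[] = {0u, 0xFFFFFFFFu}; // the only values a PCMP lane can take
      for (uint32_t m : lanes)
        assert((m & 1u) == (m >> 31)); // and-with-1 == logical shift right by width-1
      return 0;
    }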
+ EVT VT0 = Op0.getValueType(); + EVT VT1 = Op1.getValueType(); + unsigned EltBitWidth = VT0.getScalarType().getSizeInBits(); + if (VT0 != VT1 || EltBitWidth == 8) + return SDValue(); + + assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256); + + APInt SplatVal; + if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1) + return SDValue(); + + SDLoc DL(N); + SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8); + SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); + return DAG.getBitcast(N->getValueType(0), Shift); +} + +static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) + if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget)) return Zext; - if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; + if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) + return R; + + if (SDValue ShiftRight = combinePCMPAnd1(N, DAG)) + return ShiftRight; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -25834,143 +28266,176 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) - if (VT == MVT::i32 || VT == MVT::i64) { - // Check for BEXTR. - if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && - (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { - ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); - ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (MaskNode && ShiftNode) { - uint64_t Mask = MaskNode->getZExtValue(); - uint64_t Shift = ShiftNode->getZExtValue(); - if (isMask_64(Mask)) { - uint64_t MaskSize = countPopulation(Mask); - if (Shift + MaskSize <= VT.getSizeInBits()) - return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), - DAG.getConstant(Shift | (MaskSize << 8), DL, - VT)); - } - } - } // BEXTR + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + if (!Subtarget.hasBMI() && !Subtarget.hasTBM()) return SDValue(); + if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL) + return SDValue(); + + ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (MaskNode && ShiftNode) { + uint64_t Mask = MaskNode->getZExtValue(); + uint64_t Shift = ShiftNode->getZExtValue(); + if (isMask_64(Mask)) { + uint64_t MaskSize = countPopulation(Mask); + if (Shift + MaskSize <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), + DAG.getConstant(Shift | (MaskSize << 8), DL, + VT)); + } } + return SDValue(); +} - // Want to form ANDNP nodes: - // 1) In the hopes of then easily combining them with OR and AND nodes - // to form PBLEND/PSIGN. 
- // 2) To match ANDN packed intrinsics - if (VT != MVT::v2i64 && VT != MVT::v4i64) +// Try to fold: +// (or (and (m, y), (pandn m, x))) +// into: +// (vselect m, x, y) +// As a special case, try to fold: +// (or (and (m, (sub 0, x)), (pandn m, x))) +// into: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::OR); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256()))) + return SDValue(); + assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!"); + + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + + if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return SDValue(); - // Check LHS for vnot - if (N0.getOpcode() == ISD::XOR && - //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) - CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) - return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); - // Check RHS for vnot - if (N1.getOpcode() == ISD::XOR && - //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) - CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) - return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); + // Check to see if the mask appeared in both the AND and ANDNP. + if (!Y.getNode()) + return SDValue(); - return SDValue(); + // Validate that X, Y, and Mask are bitcasts, and see through them. + Mask = peekThroughBitcasts(Mask); + X = peekThroughBitcasts(X); + Y = peekThroughBitcasts(Y); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned SraAmt = ~0; + if (Mask.getOpcode() == ISD::SRA) { + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); + } else if (Mask.getOpcode() == X86ISD::VSRAI) { + SDValue SraC = Mask.getOperand(1); + SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + } + if ((SraAmt + 1) != EltBits) + return SDValue(); + + SDLoc DL(N); + + // Try to match: + // (or (and (M, (sub 0, X)), (pandn M, X))) + // which is a special case of vselect: + // (vselect M, (sub 0, X), X) + // Per: + // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate + // We know that, if fNegate is 0 or 1: + // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) + // + // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: + // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) + // ( M ? 
-X : X) == ((X ^ M ) + (M & 1)) + // This lets us transform our vselect to: + // (add (xor X, M), (and M, 1)) + // And further to: + // (sub (xor X, M), M) + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + + if (V) { + assert(EltBits == 8 || EltBits == 16 || EltBits == 32); + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + return DAG.getBitcast(VT, + DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + } + } + + // PBLENDVB is only available on SSE 4.1. + if (!Subtarget.hasSSE41()) + return SDValue(); + + MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + + X = DAG.getBitcast(BlendVT, X); + Y = DAG.getBitcast(BlendVT, Y); + Mask = DAG.getBitcast(BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getBitcast(VT, Mask); } -static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineOr(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) + return R; + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - // look for psign/blend - if (VT == MVT::v2i64 || VT == MVT::v4i64) { - if (!Subtarget->hasSSSE3() || - (VT == MVT::v4i64 && !Subtarget->hasInt256())) - return SDValue(); - - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, y), (pandn m, x)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); - - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - // Look through mask bitcast. - if (Mask.getOpcode() == ISD::BITCAST) - Mask = Mask.getOperand(0); - if (X.getOpcode() == ISD::BITCAST) - X = X.getOperand(0); - if (Y.getOpcode() == ISD::BITCAST) - Y = Y.getOperand(0); - - EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. 
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); - unsigned SraAmt = ~0; - if (Mask.getOpcode() == ISD::SRA) { - if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) - if (auto *AmtConst = AmtBV->getConstantSplatNode()) - SraAmt = AmtConst->getZExtValue(); - } else if (Mask.getOpcode() == X86ISD::VSRAI) { - SDValue SraC = Mask.getOperand(1); - SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - } - if ((SraAmt + 1) != EltBits) - return SDValue(); - - SDLoc DL(N); - - // Now we know we at least have a plendvb with the mask val. See if - // we can form a psignb/w/d. - // psign = x.type == y.type == mask.type && y = sub(0, x); - if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && - ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && - X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { - assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && - "Unsupported VT for PSIGN"); - Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); - return DAG.getBitcast(VT, Mask); - } - // PBLENDVB only available on SSE 4.1 - if (!Subtarget->hasSSE41()) - return SDValue(); - - MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; - - X = DAG.getBitcast(BlendVT, X); - Y = DAG.getBitcast(BlendVT, Y); - Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); - return DAG.getBitcast(VT, Mask); - } - } - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); @@ -25982,7 +28447,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. - if (!OptForSize && Subtarget->isSHLDSlow()) + if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) @@ -26040,7 +28505,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, } // Generate NEG and CMOV for integer abs. -static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Since X86 does not have CMOV for 8-bit integer, we don't convert @@ -26073,13 +28538,14 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Try to turn tests against the signbit in the form of: -// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) -// into: -// SETGT(X, -1) +/// Try to turn tests against the signbit in the form of: +/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +/// into: +/// SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { - // This is only worth doing if the output type is i8. - if (N->getValueType(0) != MVT::i8) + // This is only worth doing if the output type is i8 or i1. 
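The scalar form of the fold documented above: the sign bit of X, shifted down and xor'ed with 1, is exactly the predicate X > -1, so the shift/truncate/xor chain collapses to one setcc. A quick C++ check for 32-bit X (illustration only):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    int main() {
      const int32_t samples[] = {std::numeric_limits<int32_t>::min(), -1, 0, 1,
                                 std::numeric_limits<int32_t>::max()};
      for (int32_t x : samples) {
        uint8_t folded = static_cast<uint8_t>((static_cast<uint32_t>(x) >> 31) ^ 1u); // XOR(TRUNC(SRL(X,31)),1)
        assert(folded == static_cast<uint8_t>(x > -1));                               // SETGT(X, -1)
      }
      return 0;
    }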
+ EVT ResultType = N->getValueType(0); + if (ResultType != MVT::i8 && ResultType != MVT::i1) return SDValue(); SDValue N0 = N->getOperand(0); @@ -26114,22 +28580,78 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); - SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), ResultType); + SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + if (SetCCResultType != ResultType) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); return Cond; } -static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// pcmpgt X, -1 +/// +/// This should be called before type legalization because the pattern may not +/// persist after that. +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isSimple()) + return SDValue(); + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; + case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; + } + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. + auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1)); + if (!ShiftBV) + return SDValue(); + + EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); + auto *ShiftAmt = ShiftBV->getConstantSplatNode(); + if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. We don't use the more obvious + // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. + return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); +} + +static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; - if (Subtarget->hasCMov()) - if (SDValue RV = performIntegerAbsCombine(N, DAG)) + if (Subtarget.hasCMov()) + if (SDValue RV = combineIntegerAbs(N, DAG)) return RV; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) @@ -26142,7 +28664,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. 
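The pattern being detected above is a rounded average computed in a wider type: c = (a + b + 1) / 2 with a and b zero-extended always fits back in the narrow type, which is exactly what pavgb/pavgw (X86ISD::AVG) compute without the widening. An exhaustive scalar sketch for u8 lanes (pavgb here is a hypothetical one-lane model, not the vector instruction):

    #include <cassert>
    #include <cstdint>

    // Model of pavgb on one lane: (a + b + 1) >> 1 computed without overflow.
    static uint8_t pavgb(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((static_cast<uint16_t>(a) + b + 1) >> 1);
    }

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          unsigned widened = (a + b + 1) / 2; // the zext / add / add-1 / truncate pattern
          assert(widened <= 255 &&
                 widened == pavgb(static_cast<uint8_t>(a), static_cast<uint8_t>(b)));
        }
      return 0;
    }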
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, - const X86Subtarget *Subtarget, SDLoc DL) { + const X86Subtarget &Subtarget, + const SDLoc &DL) { if (!VT.isVector() || !VT.isSimple()) return SDValue(); EVT InVT = In.getValueType(); @@ -26159,10 +28682,12 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) return SDValue(); - if (Subtarget->hasAVX512()) { + if (!Subtarget.hasSSE2()) + return SDValue(); + if (Subtarget.hasAVX512()) { if (VT.getSizeInBits() > 512) return SDValue(); - } else if (Subtarget->hasAVX2()) { + } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) return SDValue(); } else { @@ -26221,10 +28746,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, Operands[0].getOperand(0).getValueType() == VT) { // The pattern is detected. Subtract one from the constant vector, then // demote it and emit X86ISD::AVG instruction. - SDValue One = DAG.getConstant(1, DL, InScalarVT); - SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, - SmallVector<SDValue, 8>(NumElems, One)); - Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + SDValue VecOnes = DAG.getConstant(1, DL, InVT); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), Operands[1]); @@ -26258,10 +28781,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); } -/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. -static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); @@ -26283,41 +28805,180 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = - DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); - SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Alignment); - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - std::min(16U, Alignment)); + SDValue Load1 = + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Alignment, Ld->getMemOperand()->getFlags()); + + Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); + SDValue Load2 = + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + std::min(16U, Alignment), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getUNDEF(RegVT); - NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); - NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl); return DCI.CombineTo(N, NewVec, TF, true); } return SDValue(); } -/// 
PerformMLOADCombine - Resolve extending loads -static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// If V is a build vector of boolean constants and exactly one of those +/// constants is true, return the operand index of that true element. +/// Otherwise, return -1. +static int getOneTrueElt(SDValue V) { + // This needs to be a build vector of booleans. + // TODO: Checking for the i1 type matches the IR definition for the mask, + // but the mask check could be loosened to i8 or other types. That might + // also require checking more than 'allOnesValue'; eg, the x86 HW + // instructions only require that the MSB is set for each mask element. + // The ISD::MSTORE comments/definition do not specify how the mask operand + // is formatted. + auto *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) + return -1; + + int TrueIndex = -1; + unsigned NumElts = BV->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + const SDValue &Op = BV->getOperand(i); + if (Op.isUndef()) + continue; + auto *ConstNode = dyn_cast<ConstantSDNode>(Op); + if (!ConstNode) + return -1; + if (ConstNode->getAPIntValue().isAllOnesValue()) { + // If we already found a one, this is too many. + if (TrueIndex >= 0) + return -1; + TrueIndex = i; + } + } + return TrueIndex; +} + +/// Given a masked memory load/store operation, return true if it has one mask +/// bit set. If it has one mask bit set, then also return the memory address of +/// the scalar element to load/store, the vector index to insert/extract that +/// scalar element, and the alignment for the scalar memory access. +static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, + SelectionDAG &DAG, SDValue &Addr, + SDValue &Index, unsigned &Alignment) { + int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); + if (TrueMaskElt < 0) + return false; + + // Get the address of the one scalar element that is specified by the mask + // using the appropriate offset from the base pointer. + EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); + Addr = MaskedOp->getBasePtr(); + if (TrueMaskElt != 0) { + unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); + Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); + } + + Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); + Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); + return true; +} + +/// If exactly one element of the mask is set for a non-extending masked load, +/// it is a scalar load and vector insert. +/// Note: It is expected that the degenerate cases of an all-zeros or all-ones +/// mask have already been optimized in IR, so we don't bother with those here. +static SDValue +reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. + // However, some target hooks may need to be added to know when the transform + // is profitable. Endianness would also have to be considered. + + SDValue Addr, VecIndex; + unsigned Alignment; + if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) + return SDValue(); + + // Load the one scalar element that is specified by the mask using the + // appropriate offset from the base pointer. 
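When only one mask bit is set, the masked load above touches exactly one memory element; every other result lane comes from the pass-through (src0) vector, so a scalar load plus an insert is equivalent. A plain C++ model of that reduction (hypothetical 4-lane i32 vectors, reference semantics only):

    #include <array>
    #include <cassert>

    using Vec4 = std::array<int, 4>;

    // Reference masked-load semantics: active lanes read memory, others keep src0.
    static Vec4 maskedLoad(const int *mem, const std::array<bool, 4> &mask, Vec4 src0) {
      for (int i = 0; i < 4; ++i)
        if (mask[i])
          src0[i] = mem[i];
      return src0;
    }

    int main() {
      const int mem[4] = {10, 20, 30, 40};
      const Vec4 src0 = {-1, -2, -3, -4};
      const std::array<bool, 4> oneBitMask = {false, false, true, false}; // single true lane
      // Reduction: scalar load of the one selected element + insert into src0.
      Vec4 reduced = src0;
      reduced[2] = mem[2];
      assert(maskedLoad(mem, oneBitMask, src0) == reduced);
      return 0;
    }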
+ SDLoc DL(ML); + EVT VT = ML->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + SDValue Load = + DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), + Alignment, ML->getMemOperand()->getFlags()); + + // Insert the loaded element into the appropriate place in the vector. + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(), + Load, VecIndex); + return DCI.CombineTo(ML, Insert, Load.getValue(1), true); +} + +static SDValue +combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) + return SDValue(); + + SDLoc DL(ML); + EVT VT = ML->getValueType(0); + + // If we are loading the first and last elements of a vector, it is safe and + // always faster to load the whole vector. Replace the masked load with a + // vector load and select. + unsigned NumElts = VT.getVectorNumElements(); + BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask()); + bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); + bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); + if (LoadFirstElt && LoadLastElt) { + SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), + ML->getMemOperand()); + SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0()); + return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); + } + + // Convert a masked load with a constant mask into a masked load and a select. + // This allows the select operation to use a faster kind of select instruction + // (for example, vblendvps -> vblendps). + + // Don't try this if the pass-through operand is already undefined. That would + // cause an infinite loop because that's what we're about to create. + if (ML->getSrc0().isUndef()) + return SDValue(); + + // The new masked load has an undef pass-through operand. The select uses the + // original pass-through operand. + SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), + ML->getMask(), DAG.getUNDEF(VT), + ML->getMemoryVT(), ML->getMemOperand(), + ML->getExtensionType()); + SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0()); + + return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); +} + +static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { + if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) + return ScalarLoad; + // TODO: Do some AVX512 subsets benefit from this transform? + if (!Subtarget.hasAVX512()) + if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) + return Blend; + } + if (Mld->getExtensionType() != ISD::SEXTLOAD) return SDValue(); + // Resolve extending loads. EVT VT = Mld->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); @@ -26326,21 +28987,21 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, assert(LdVT != VT && "Cannot extend to the same type"); unsigned ToSz = VT.getVectorElementType().getSizeInBits(); unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); - // From, To sizes and ElemCount must be pow of two + // From/To sizes and ElemCount must be pow of two. 
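A similar plain-C++ sketch (hypothetical helper, not part of the patch) of the constant-mask case handled above: because the first and last lanes are known to be loaded, touching the whole vector is safe, so it is read unconditionally and then blended with the pass-through operand, which is exactly the load + select pair built above.

#include <array>
#include <cstddef>

template <typename T, std::size_t N>
std::array<T, N> maskedLoadAsLoadPlusBlend(const T *Base, const bool (&Mask)[N],
                                           std::array<T, N> PassThru) {
  std::array<T, N> Full;
  for (std::size_t i = 0; i != N; ++i) // unconditional vector load
    Full[i] = Base[i];
  for (std::size_t i = 0; i != N; ++i) // vselect(Mask, Full, PassThru)
    PassThru[i] = Mask[i] ? Full[i] : PassThru[i];
  return PassThru;
}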
assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); unsigned SizeRatio = ToSz / FromSz; assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); - // Create a type on which we perform the shuffle + // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - // Convert Src0 value + // Convert Src0 value. SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); - if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { + if (!Mld->getSrc0().isUndef()) { SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -26349,13 +29010,13 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, - DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), ShuffleVec); } - // Prepare the new mask + // Prepare the new mask. SDValue NewMask; SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { - // Mask and original value have the same type + // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) @@ -26364,9 +29025,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), - &ShuffleVec[0]); - } - else { + ShuffleVec); + } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); @@ -26390,13 +29050,41 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } -/// PerformMSTORECombine - Resolve truncating stores -static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + +/// If exactly one element of the mask is set for a non-truncating masked store, +/// it is a vector extract and scalar store. +/// Note: It is expected that the degenerate cases of an all-zeros or all-ones +/// mask have already been optimized in IR, so we don't bother with those here. +static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, + SelectionDAG &DAG) { + // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. + // However, some target hooks may need to be added to know when the transform + // is profitable. Endianness would also have to be considered. + + SDValue Addr, VecIndex; + unsigned Alignment; + if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) + return SDValue(); + + // Extract the one scalar element that is actually being stored. + SDLoc DL(MS); + EVT VT = MS->getValue().getValueType(); + EVT EltVT = VT.getVectorElementType(); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + MS->getValue(), VecIndex); + + // Store that element at the appropriate offset from the base pointer. 
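The store-side counterpart can be sketched the same way (plain C++, invented helper name, not the DAG API): with exactly one true mask lane, the masked store is just an extract of that lane followed by a scalar store at the corresponding offset from the base pointer.

#include <array>
#include <cstddef>

template <typename T, std::size_t N>
void maskedStoreOneTrueElt(T *Base, std::size_t TrueIdx,
                           const std::array<T, N> &Val) {
  Base[TrueIdx] = Val[TrueIdx]; // EXTRACT_VECTOR_ELT + scalar store
}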
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), + Alignment, MS->getMemOperand()->getFlags()); +} + +static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); if (!Mst->isTruncatingStore()) - return SDValue(); + return reduceMaskedStoreToScalarStore(Mst, DAG); + // Resolve truncating stores. EVT VT = Mst->getValue().getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT StVT = Mst->getMemoryVT(); @@ -26415,7 +29103,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); - // From, To sizes and ElemCount must be pow of two + // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. @@ -26426,7 +29114,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - // Create a type on which we perform the shuffle + // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); @@ -26443,12 +29131,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + ShuffleVec); SDValue NewMask; SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { - // Mask and original value have the same type + // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -26456,9 +29144,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), - &ShuffleVec[0]); - } - else { + ShuffleVec); + } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); @@ -26479,9 +29166,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, Mst->getBasePtr(), NewMask, StVT, Mst->getMemOperand(), false); } -/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 
-static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + +static SDValue combineStore(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { StoreSDNode *St = cast<StoreSDNode>(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); @@ -26496,26 +29183,24 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AddressSpace, Alignment, &Fast) && !Fast) { + AddressSpace, Alignment, &Fast) && + !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); - SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); + SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); - SDValue Stride = - DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); - - SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); - SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), - std::min(16U, Alignment)); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); + + SDValue Ch0 = + DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), + Alignment, St->getMemOperand()->getFlags()); + SDValue Ch1 = + DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), + std::min(16U, Alignment), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } @@ -26526,12 +29211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. - SDValue Avg = - detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); - if (Avg.getNode()) + if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, + Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); @@ -26543,7 +29227,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegal(VT, StVT)) + if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two @@ -26573,7 +29257,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. 
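For reference, a minimal scalar sketch (plain C++, not DAG code) of the identity detectAVGPattern relies on for the unsigned i8 case: widening, adding one, shifting right and truncating is exactly a PAVGB-style rounding average, since the 9-bit intermediate sum never overflows the wider type.

#include <cstdint>

static inline uint8_t roundingAvgU8(uint8_t a, uint8_t b) {
  // (zext a + zext b + 1) >> 1, then truncate back to 8 bits.
  return static_cast<uint8_t>((uint32_t(a) + uint32_t(b) + 1) >> 1);
}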
@@ -26595,8 +29279,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, - TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. @@ -26604,10 +29286,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i, dl)); - SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Ch = + DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); Chains.push_back(Ch); } @@ -26626,9 +29308,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = - !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); if ((VT.isVector() || - (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && + (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa<LoadSDNode>(St->getValue()) && !cast<LoadSDNode>(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { @@ -26667,58 +29349,49 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. - if (Subtarget->is64Bit() || F64IsLegal) { - MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + if (Subtarget.is64Bit() || F64IsLegal) { + MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); SDValue NewChain = NewLd.getValue(1); - if (TokenFactorIndex != -1) { + if (TokenFactorIndex >= 0) { Ops.push_back(NewChain); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), - St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), - St->getAlignment()); + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); } // Otherwise, lower to two pairs of 32-bit loads / stores. 
SDValue LoAddr = Ld->getBasePtr(); - SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, - DAG.getConstant(4, LdDL, MVT::i32)); + SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, - Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), - MinAlign(Ld->getAlignment(), 4)); + MinAlign(Ld->getAlignment(), 4), + Ld->getMemOperand()->getFlags()); SDValue NewChain = LoLd.getValue(1); - if (TokenFactorIndex != -1) { + if (TokenFactorIndex >= 0) { Ops.push_back(LoLd); Ops.push_back(HiLd); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } LoAddr = St->getBasePtr(); - HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, - DAG.getConstant(4, StDL, MVT::i32)); - - SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, - St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), - St->getAlignment()); - SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, - St->getPointerInfo().getWithOffset(4), - St->isVolatile(), - St->isNonTemporal(), - MinAlign(St->getAlignment(), 4)); + HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); + + SDValue LoSt = + DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + SDValue HiSt = DAG.getStore( + NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), + MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } @@ -26728,7 +29401,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // to get past legalization. The execution dependencies fixup pass will // choose the optimal machine instruction for the store if this really is // an integer or v2f32 rather than an f64. 
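A plain-C++ sketch of the 32-bit fallback above (hypothetical helper; little-endian layout as on x86 is assumed): the i64 load/store pair is replaced by copying the low half at offset 0 and the high half at offset 4.

#include <cstdint>
#include <cstring>

static void copyI64AsTwoI32(const void *Src, void *Dst) {
  uint32_t Lo, Hi;
  std::memcpy(&Lo, static_cast<const char *>(Src), 4);     // LoLd
  std::memcpy(&Hi, static_cast<const char *>(Src) + 4, 4); // HiLd, offset 4
  std::memcpy(Dst, &Lo, 4);                                // LoSt
  std::memcpy(static_cast<char *>(Dst) + 4, &Hi, 4);       // HiSt, offset 4
}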
- if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && + if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue OldExtract = St->getOperand(1); SDValue ExtOp0 = OldExtract.getOperand(0); @@ -26738,8 +29411,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); } return SDValue(); @@ -26798,14 +29471,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { SDValue A, B; SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) + if (!LHS.getOperand(0).isUndef()) A = LHS.getOperand(0); - if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) + if (!LHS.getOperand(1).isUndef()) B = LHS.getOperand(1); ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { - if (LHS.getOpcode() != ISD::UNDEF) + if (!LHS.isUndef()) A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; @@ -26816,14 +29489,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { SDValue C, D; SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) + if (!RHS.getOperand(0).isUndef()) C = RHS.getOperand(0); - if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) + if (!RHS.getOperand(1).isUndef()) D = RHS.getOperand(1); ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { - if (RHS.getOpcode() != ISD::UNDEF) + if (!RHS.isUndef()) C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; @@ -26871,33 +29544,22 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { return true; } -/// Do target-specific dag combines on floating point adds. -static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // Try to synthesize horizontal adds from adds of shuffles. - if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, true)) - return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); - return SDValue(); -} - -/// Do target-specific dag combines on floating point subs. -static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +/// Do target-specific dag combines on floating-point adds/subs. +static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + bool IsFadd = N->getOpcode() == ISD::FADD; + assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); - // Try to synthesize horizontal subs from subs of shuffles. 
- if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, false)) - return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); + // Try to synthesize horizontal add/sub from adds/subs of shuffles. + if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, IsFadd)) { + auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; + return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); + } return SDValue(); } @@ -26916,13 +29578,11 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, // First, use mask to unset all bits that won't appear in the result. assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && "OutSVT can only be either i8 or i16."); - SDValue MaskVal = - DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); - SDValue MaskVec = DAG.getNode( - ISD::BUILD_VECTOR, DL, InVT, - SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + APInt Mask = + APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); + SDValue MaskVal = DAG.getConstant(Mask, DL, InVT); for (auto &Reg : Regs) - Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg); MVT UnpackedVT, PackedVT; if (OutSVT == MVT::i8) { @@ -26938,7 +29598,7 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); j < e; j *= 2, RegNum /= 2) { for (unsigned i = 0; i < RegNum; i++) - Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]); for (unsigned i = 0; i < RegNum / 2; i++) Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], Regs[i * 2 + 1]); @@ -26990,7 +29650,7 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, /// element that is extracted from a vector and then truncated, and it is /// diffcult to do this optimization based on them. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); if (!OutVT.isVector()) return SDValue(); @@ -27005,7 +29665,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on // SSE2, and we need to take care of it specially. // AVX512 provides vpmovdb. - if (!Subtarget->hasSSE2() || Subtarget->hasAVX2()) + if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) return SDValue(); EVT OutSVT = OutVT.getVectorElementType(); @@ -27016,7 +29676,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. - if (Subtarget->hasSSSE3() && NumElems == 8 && + if (Subtarget.hasSSSE3() && NumElems == 8 && ((OutSVT == MVT::i8 && InSVT != MVT::i64) || (InSVT == MVT::i32 && OutSVT == MVT::i16))) return SDValue(); @@ -27026,20 +29686,17 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, // Split a long vector into vectors of legal type. 
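Why the AND with APInt::getLowBitsSet above lets PACKUS act as a plain truncate can be seen from a scalar sketch (plain C++, i32 -> i16 case; helper names are illustrative only): PACKUS saturates signed inputs to the unsigned output range, so once the high bits are cleared the saturation never fires and each lane is exactly the truncated low bits.

#include <cstdint>

static uint16_t packusLaneI32ToI16(int32_t v) { // PACKUSDW-style saturation
  if (v < 0)
    return 0;
  if (v > 0xFFFF)
    return 0xFFFF;
  return static_cast<uint16_t>(v);
}

static uint16_t truncViaPackus(uint32_t v) {
  uint32_t Masked = v & 0xFFFF;                            // the AND inserted above
  return packusLaneI32ToI16(static_cast<int32_t>(Masked)); // == (uint16_t)v
}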
unsigned RegNum = InVT.getSizeInBits() / 128; SmallVector<SDValue, 8> SubVec(RegNum); - if (InSVT == MVT::i32) { - for (unsigned i = 0; i < RegNum; i++) - SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(i * 4, DL)); - } else { - for (unsigned i = 0; i < RegNum; i++) - SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(i * 2, DL)); - } + unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); + EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); - // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, + DAG.getIntPtrConstant(i * NumSubRegElts, DL)); + + // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to // truncate 2 x v4i32 to v8i16. - if (Subtarget->hasSSE41() || OutSVT == MVT::i8) + if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) return combineVectorTruncationWithPACKSS(N, DAG, SubVec); @@ -27047,20 +29704,30 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + SDLoc DL(N); + // Try to detect AVG pattern first. - SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, - Subtarget, SDLoc(N)); - if (Avg.getNode()) + if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // The bitcast source is a direct mmx result. + // Detect bitcasts between i32 to x86mmx + if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { + SDValue BCSrc = Src.getOperand(0); + if (BCSrc.getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); + } + return combineVectorTruncation(N, DAG, Subtarget); } /// Do target-specific dag combines on floating point negations. -static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); SDValue Arg = N->getOperand(0); @@ -27074,7 +29741,7 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && - Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); @@ -27102,17 +29769,17 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - if (VT.is512BitVector() && !Subtarget->hasDQI()) { + if (VT.is512BitVector() && !Subtarget.hasDQI()) { // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. 
// These logic operations may be executed in the integer domain. SDLoc dl(N); MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode = 0; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected FP logic op"); @@ -27122,13 +29789,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); - return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + return DAG.getBitcast(VT, IntOp); } return SDValue(); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. -static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x @@ -27145,7 +29812,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. -static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. @@ -27165,9 +29832,9 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } -static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - if (Subtarget->useSoftFloat()) +static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (Subtarget.useSoftFloat()) return SDValue(); // TODO: Check for global or instruction-level "nnan". In that case, we @@ -27176,9 +29843,9 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, // should be an optional swap and FMAX/FMIN. EVT VT = N->getValueType(0); - if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) return SDValue(); // This takes at least 3 instructions, so favor a library call when operating @@ -27222,8 +29889,8 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, } /// Do target-specific dag combines on X86ISD::FAND nodes. 
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -27238,8 +29905,8 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -27253,9 +29920,8 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, return lowerX86FPLogicOp(N, DAG, Subtarget); } -static SDValue PerformBTCombine(SDNode *N, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue combineBT(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. SDValue Op1 = N->getOperand(1); if (Op1.hasOneUse()) { @@ -27272,21 +29938,19 @@ static SDValue PerformBTCombine(SDNode *N, return SDValue(); } -static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); - if (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); +static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) { + SDValue Op = peekThroughBitcasts(N->getOperand(0)); EVT VT = N->getValueType(0), OpVT = Op.getValueType(); if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == OpVT.getVectorElementType().getSizeInBits()) { - return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); + return DAG.getBitcast(VT, Op); } return SDValue(); } -static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); @@ -27307,7 +29971,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. - if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); @@ -27325,7 +29989,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, /// to combine math ops, use an LEA, or use a complex addressing mode. This can /// eliminate extend, add, and shift instructions. static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget &Subtarget) { // TODO: This should be valid for other integer types. 
EVT VT = Sext->getValueType(0); if (VT != MVT::i64) @@ -27397,14 +30061,106 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { return R.getValue(1); } -static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or +/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating +/// with UNDEFs) of the input to vectors of the same size as the target type +/// which then extends the lowest elements. +static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2()) + return SDValue(); + SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); + + // Input type must be a vector and we must be extending legal integer types. + if (!VT.isVector()) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + // On AVX2+ targets, if the input/output types are both legal then we will be + // able to use SIGN_EXTEND/ZERO_EXTEND directly. + if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + return SDValue(); + + SDLoc DL(N); + + auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { + EVT InVT = N.getValueType(); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + Size / InVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to + // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. + // Also use this if we don't have SSE41 to allow the legalizer do its job. + if (!Subtarget.hasSSE41() || VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.hasInt256())) { + SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); + return Opcode == ISD::SIGN_EXTEND + ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) + : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. 
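A scalar model of the *_EXTEND_VECTOR_INREG nodes used above and in the split that follows (plain C++ sketch with an array-based vector; sign-extension case shown): only as many low input lanes as there are result lanes get extended, and the upper input lanes are ignored, which is why ExtendVecSize may pad the source with UNDEFs.

#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t OutN, std::size_t InN>
std::array<int32_t, OutN>
signExtendVectorInReg(const std::array<int8_t, InN> &In) {
  static_assert(InN >= OutN, "input must cover all result lanes");
  std::array<int32_t, OutN> Out{};
  for (std::size_t i = 0; i != OutN; ++i)
    Out[i] = In[i]; // implicit i8 -> i32 sign extension; lanes >= OutN unused
  return Out;
}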
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = Opcode == ISD::SIGN_EXTEND + ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) + : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + + return SDValue(); +} + +static SDValue combineSext(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); SDLoc DL(N); if (SDValue DivRem8 = getDivRem8(N, DAG)) @@ -27414,70 +30170,16 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } - if (VT.isVector() && Subtarget->hasSSE2()) { - auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { - EVT InVT = N.getValueType(); - EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), - Size / InVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), - DAG.getUNDEF(InVT)); - Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); - }; - - // If target-size is less than 128-bits, extend to a type that would extend - // to 128 bits, extend that and extract the original target vector. - if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && - (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && - (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { - unsigned Scale = 128 / VT.getSizeInBits(); - EVT ExVT = - EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); - SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); - SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, - DAG.getIntPtrConstant(0, DL)); - } - - // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG - // which ensures lowering to X86ISD::VSEXT (pmovsx*). - if (VT.getSizeInBits() == 128 && - (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && - (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { - SDValue ExOp = ExtendVecSize(DL, N0, 128); - return DAG.getSignExtendVectorInReg(ExOp, DL, VT); - } - - // On pre-AVX2 targets, split into 128-bit nodes of - // ISD::SIGN_EXTEND_VECTOR_INREG. 
- if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && - (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && - (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { - unsigned NumVecs = VT.getSizeInBits() / 128; - unsigned NumSubElts = 128 / SVT.getSizeInBits(); - EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); - EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - - SmallVector<SDValue, 8> Opnds; - for (unsigned i = 0, Offset = 0; i != NumVecs; - ++i, Offset += NumSubElts) { - SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, - DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, 128); - SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); - Opnds.push_back(SrcVec); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - } - } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; - if (Subtarget->hasAVX() && VT.is256BitVector()) + if (Subtarget.hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -27487,8 +30189,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget* Subtarget) { +static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -27497,7 +30199,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -27526,9 +30228,9 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, dl, VT, A, B, C); } -static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineZext(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because @@ -27563,6 +30265,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, } } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -27573,10 +30278,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Optimize x == -y --> x+y == 0 -// x != -y --> x+y != 0 -static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget* Subtarget) { +/// Optimize x == -y --> x+y == 0 +/// x != -y --> x+y != 0 +static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -27631,10 +30336,15 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, } } + // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization + // via legalization because v4i32 is not a legal type. 
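The rewrite documented above (x == -y --> x+y == 0) is a pure modular-arithmetic identity; a small plain-C++ check of the scalar case (illustrative only):

#include <cassert>
#include <cstdint>

static void checkNegCompareFold(uint32_t X, uint32_t Y) {
  bool CmpNeg = (X == static_cast<uint32_t>(0u - Y)); // x == -y
  bool AddCmp = (static_cast<uint32_t>(X + Y) == 0u); // x + y == 0
  assert(CmpNeg == AddCmp); // holds for all X, Y under wrap-around arithmetic
}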
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) + return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); + return SDValue(); } -static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // Gather and Scatter instructions use k-registers for masks. The type of // the masks is v*i1. So the mask will be truncated anyway. @@ -27648,11 +30358,11 @@ static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// Helper function of performSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. -static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG, - MVT VT) { +static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, + SelectionDAG &DAG, MVT VT) { if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, @@ -27667,9 +30377,9 @@ static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG, } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -27698,7 +30408,8 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_B) return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { + // Try to simplify the EFLAGS and condition code operands. + if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } @@ -27706,28 +30417,28 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Optimize branch condition evaluation. -// -static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// Optimize branch condition evaluation. +static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Dest = N->getOperand(1); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); - if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { + // Try to simplify the EFLAGS and condition code operands. + // Make sure to not keep references to operands, as combineSetCCEFLAGS can + // RAUW them under us. 
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); - return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, - Flags); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), + N->getOperand(1), Cond, Flags); } return SDValue(); } -static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, - SelectionDAG &DAG) { +static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, + SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // @@ -27772,8 +30483,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); @@ -27797,11 +30508,11 @@ static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. - if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) + if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) return Res; // Now move on to more general possibilities. @@ -27822,18 +30533,18 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. - if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { + if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); EVT LdVT = Ld->getValueType(0); - // This transformation is not supported if the result type is f16 - if (VT == MVT::f16) + // This transformation is not supported if the result type is f16 or f128. + if (VT == MVT::f16 || VT == MVT::f128) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget->is64Bit() && LdVT == MVT::i64) { - SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( + !Subtarget.is64Bit() && LdVT == MVT::i64) { + SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; @@ -27843,8 +30554,8 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS -static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, - X86TargetLowering::DAGCombinerInfo &DCI) { +static SDValue combineADC(SDNode *N, SelectionDAG &DAG, + X86TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 
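One way to see why the (add Y, (sete X, 0)) -> adc 0, Y style folds a few lines below are valid (plain C++ sketch, unsigned case): the carry flag produced by comparing X against 1 is set exactly when X is zero, so the setcc value can be folded into the add as a carry-in.

#include <cassert>
#include <cstdint>

static void checkSeteToAdcFold(uint32_t X, uint32_t Y) {
  uint32_t CF = (X < 1u) ? 1u : 0u;    // carry after "cmp X, 1"
  uint32_t Sete = (X == 0u) ? 1u : 0u; // sete X, 0
  assert(Y + Sete == Y + 0u + CF);     // add Y, sete  ==  adc 0, Y
}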
@@ -27868,10 +30579,10 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// fold (add Y, (sete X, 0)) -> adc 0, Y -// (add Y, (setne X, 0)) -> sbb -1, Y -// (sub (sete X, 0), Y) -> sbb 0, Y -// (sub (setne X, 0), Y) -> adc -1, Y +/// fold (add Y, (sete X, 0)) -> adc 0, Y +/// (add Y, (setne X, 0)) -> sbb -1, Y +/// (sub (sete X, 0), Y) -> sbb 0, Y +/// (sub (setne X, 0), Y) -> adc -1, Y static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); @@ -27909,24 +30620,163 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } -/// PerformADDCombine - Do target-specific dag combines on integer adds. -static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!VT.isVector() || !VT.isSimple() || + !(VT.getVectorElementType() == MVT::i32)) + return SDValue(); + + unsigned RegSize = 128; + if (Subtarget.hasBWI()) + RegSize = 512; + else if (Subtarget.hasAVX2()) + RegSize = 256; + + // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + if (VT.getSizeInBits() / 4 > RegSize) + return SDValue(); + + // Detect the following pattern: + // + // 1: %2 = zext <N x i8> %0 to <N x i32> + // 2: %3 = zext <N x i8> %1 to <N x i32> + // 3: %4 = sub nsw <N x i32> %2, %3 + // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] + // 5: %6 = sub nsw <N x i32> zeroinitializer, %4 + // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 + // 7: %8 = add nsw <N x i32> %7, %vec.phi + // + // The last instruction must be a reduction add. The instructions 3-6 forms an + // ABSDIFF pattern. + + // The two operands of reduction add are from PHI and a select-op as in line 7 + // above. + SDValue SelectOp, Phi; + if (Op0.getOpcode() == ISD::VSELECT) { + SelectOp = Op0; + Phi = Op1; + } else if (Op1.getOpcode() == ISD::VSELECT) { + SelectOp = Op1; + Phi = Op0; + } else + return SDValue(); + + // Check the condition of the select instruction is greater-than. + SDValue SetCC = SelectOp->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC) + return SDValue(); + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); + if (CC != ISD::SETGT) + return SDValue(); + + Op0 = SelectOp->getOperand(1); + Op1 = SelectOp->getOperand(2); + + // The second operand of SelectOp Op1 is the negation of the first operand + // Op0, which is implemented as 0 - Op0. + if (!(Op1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) && + Op1.getOperand(1) == Op0)) + return SDValue(); + + // The first operand of SetCC is the first operand of SelectOp, which is the + // difference between two input vectors. + if (SetCC.getOperand(0) != Op0) + return SDValue(); + + // The second operand of > comparison can be either -1 or 0. + if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || + ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) + return SDValue(); + + // The first operand of SelectOp is the difference between two input vectors. + if (Op0.getOpcode() != ISD::SUB) + return SDValue(); + + Op1 = Op0.getOperand(1); + Op0 = Op0.getOperand(0); + + // Check if the operands of the diff are zero-extended from vectors of i8. 
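For reference, a scalar sketch (plain C++) of what PSADBW produces per 8-byte group, i.e. the value the zext/sub/abs-select/add reduction matched here is folded into; the real node yields one i64 sum per 64-bit lane of its inputs.

#include <cstdint>
#include <cstdlib>

static uint64_t psadbwGroup(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += static_cast<uint64_t>(std::abs(int(A[i]) - int(B[i])));
  return Sum;
}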
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND || + Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || + Op1.getOpcode() != ISD::ZERO_EXTEND || + Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) + return SDValue(); + + // SAD pattern detected. Now build a SAD instruction and an addition for + // reduction. Note that the number of elments of the result of SAD is less + // than the number of elements of its input. Therefore, we could only update + // part of elements in the reduction vector. + + // Legalize the type of the inputs of PSADBW. + EVT InVT = Op0.getOperand(0).getValueType(); + if (InVT.getSizeInBits() <= 128) + RegSize = 128; + else if (InVT.getSizeInBits() <= 256) + RegSize = 256; + + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); + Ops[0] = Op0.getOperand(0); + MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + Ops[0] = Op1.getOperand(0); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + + // The output of PSADBW is a vector of i64. + MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); + SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1); + + // We need to turn the vector of i64 into a vector of i32. + // If the reduction vector is at least as wide as the psadbw result, just + // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero + // anyway. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + if (VT.getSizeInBits() >= ResVT.getSizeInBits()) + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + else + Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); + + if (VT.getSizeInBits() > ResVT.getSizeInBits()) { + // Update part of elements of the reduction vector. This is done by first + // extracting a sub-vector from it, updating this sub-vector, and inserting + // it back. + SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, + DAG.getIntPtrConstant(0, DL)); + SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, + DAG.getIntPtrConstant(0, DL)); + } else + return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); +} + +static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + if (Flags->hasVectorReduction()) { + if (SDValue Sad = detectSADPattern(N, DAG, Subtarget)) + return Sad; + } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } -static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue combineSub(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -27950,30 +30800,44 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. 
EVT VT = N->getValueType(0); - if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } -/// performVZEXTCombine - Performs build vector combines -static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue combineVZext(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N->getSimpleValueType(0); + MVT SVT = VT.getVectorElementType(); SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); - // (vzext (bitcast (vzext (x)) -> (vzext x) - SDValue V = Op; - while (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); + // Perform any constant folding. + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + SmallVector<SDValue, 4> Vals; + for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + SDValue OpElt = Op.getOperand(i); + if (OpElt.getOpcode() == ISD::UNDEF) { + Vals.push_back(DAG.getUNDEF(SVT)); + continue; + } + APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); + assert(Cst.getBitWidth() == OpEltVT.getSizeInBits()); + Cst = Cst.zextOrTrunc(SVT.getSizeInBits()); + Vals.push_back(DAG.getConstant(Cst, DL, SVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals); + } + // (vzext (bitcast (vzext (x)) -> (vzext x) + SDValue V = peekThroughBitcasts(Op); if (V != Op && V.getOpcode() == X86ISD::VZEXT) { MVT InnerVT = V.getSimpleValueType(); MVT InnerEltVT = InnerVT.getVectorElementType(); @@ -28022,61 +30886,111 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Canonicalize (LSUB p, 1) -> (LADD p, -1). 
+static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue Chain = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + MVT VT = RHS.getSimpleValueType(); + SDLoc DL(N); + + auto *C = dyn_cast<ConstantSDNode>(RHS); + if (!C || C->getZExtValue() != 1) + return SDValue(); + + RHS = DAG.getConstant(-1, DL, VT); + MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); + return DAG.getMemIntrinsicNode(X86ISD::LADD, DL, + DAG.getVTList(MVT::i32, MVT::Other), + {Chain, LHS, RHS}, VT, MMO); +} + +// TEST (AND a, b) ,(AND a, b) -> TEST a, b +static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (Op0 != Op1 || Op1->getOpcode() != ISD::AND) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + + return DAG.getNode(X86ISD::TESTM, DL, VT, + Op0->getOperand(0), Op0->getOperand(1)); +} + +static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = N->getSimpleValueType(0); + SDLoc DL(N); + + if (N->getOperand(0) == N->getOperand(1)) { + if (N->getOpcode() == X86ISD::PCMPEQ) + return getOnesVector(VT, Subtarget, DAG, DL); + if (N->getOpcode() == X86ISD::PCMPGT) + return getZeroVector(VT, Subtarget, DAG, DL); + } + + return SDValue(); +} + + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; - case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); + case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: - case X86ISD::SHRUNKBLEND: - return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); - case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); - case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); - case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); - case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); - case ISD::MUL: return PerformMulCombine(N, DAG, DCI); + case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); + case ISD::ADD: return combineAdd(N, DAG, Subtarget); + case ISD::SUB: return combineSub(N, DAG, Subtarget); + case X86ISD::ADC: return combineADC(N, DAG, DCI); + case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); - case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); - case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); - case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); - case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); - case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); - case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); - case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); - case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); - case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); - case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); - case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); - case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); - case 
ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); + case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); + case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); + case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); + case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); + case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); + case ISD::STORE: return combineStore(N, DAG, Subtarget); + case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); + case ISD::FADD: + case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); + case ISD::FNEG: return combineFneg(N, DAG, Subtarget); + case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::FXOR: - case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); + case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: - case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); + case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: - case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, - Subtarget); - case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); - case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); - case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); + case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); + case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); + case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); + case X86ISD::BT: return combineBT(N, DAG, DCI); + case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG); case ISD::ANY_EXTEND: - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); - case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::SIGN_EXTEND_INREG: - return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); - case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); - case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); - case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); - case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); + case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); + case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); + case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); + case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); + case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles + case X86ISD::INSERTPS: case X86ISD::PALIGNR: + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: @@ -28086,23 +31000,36 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::VPPERM: + case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: + case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: - case 
ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); - case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); + case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); + case ISD::FMA: return combineFMA(N, DAG, Subtarget); case ISD::MGATHER: - case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); + case ISD::MSCATTER: return combineGatherScatter(N, DAG); + case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget); + case X86ISD::TESTM: return combineTestM(N, DAG); + case X86ISD::PCMPEQ: + case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); } return SDValue(); } -/// isTypeDesirableForOp - Return true if the target has native support for -/// the specified value type and it is 'desirable' to use the type for the -/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 -/// instruction encodings are longer and some i16 instructions are slow. +/// Return true if the target has native support for the specified value type +/// and it is 'desirable' to use the type for the given node type. e.g. On x86 +/// i16 is legal, but undesirable since i16 instruction encodings are longer and +/// some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; @@ -28140,9 +31067,9 @@ bool X86TargetLowering::hasCopyImplyingStackAdjustment( [](const MachineInstr &RI) { return RI.isCopy(); }); } -/// IsDesirableToPromoteOp - This method query the target whether it is -/// beneficial for dag combiner to promote the specified node. If true, it -/// should return the desired promotion type by reference. +/// This method query the target whether it is beneficial for dag combiner to +/// promote the specified node. If true, it should return the desired promotion +/// type by reference. bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); if (VT != MVT::i16) @@ -28152,23 +31079,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { bool Commute = false; switch (Op.getOpcode()) { default: break; - case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(Op); - // If the non-extending load has a single use and it's not live out, then it - // might be folded. - if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& - Op.hasOneUse()*/) { - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - // The only case where we'd want to promote LOAD (rather then it being - // promoted as an operand is when it's only use is liveout. - if (UI->getOpcode() != ISD::CopyToReg) - return false; - } - } - Promote = true; - break; - } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: @@ -28250,7 +31160,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); - std::string AsmStr = IA->getAsmString(); + const std::string &AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) @@ -28323,8 +31233,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. +/// Given a constraint letter, return the type of constraint for this target. 
X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { @@ -28403,13 +31312,13 @@ TargetLowering::ConstraintWeight weight = CW_SpecificReg; break; case 'y': - if (type->isX86_MMXTy() && Subtarget->hasMMX()) + if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; case 'x': case 'Y': - if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || - ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) weight = CW_Register; break; case 'I': @@ -28471,25 +31380,25 @@ TargetLowering::ConstraintWeight return weight; } -/// LowerXConstraint - try to replace an X constraint, which matches anything, -/// with another that has more specific requirements based on the type of the -/// corresponding operand. +/// Try to replace an X constraint, which matches anything, with another that +/// has more specific requirements based on the type of the corresponding +/// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget->hasSSE2()) + if (Subtarget.hasSSE2()) return "Y"; - if (Subtarget->hasSSE1()) + if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. +/// Lower the specified operand into the Ops vector. +/// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue>&Ops, @@ -28532,7 +31441,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'L': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || - (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; @@ -28605,7 +31514,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. - if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with @@ -28639,8 +31548,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. - if (isGlobalStubReference( - Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) + if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), @@ -28656,6 +31564,65 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +/// Check if \p RC is a general purpose register class. +/// I.e., GR* or one of their variant. 
+static bool isGRClass(const TargetRegisterClass &RC) { + switch (RC.getID()) { + case X86::GR8RegClassID: + case X86::GR8_ABCD_LRegClassID: + case X86::GR8_ABCD_HRegClassID: + case X86::GR8_NOREXRegClassID: + case X86::GR16RegClassID: + case X86::GR16_ABCDRegClassID: + case X86::GR16_NOREXRegClassID: + case X86::GR32RegClassID: + case X86::GR32_ABCDRegClassID: + case X86::GR32_TCRegClassID: + case X86::GR32_NOREXRegClassID: + case X86::GR32_NOAXRegClassID: + case X86::GR32_NOSPRegClassID: + case X86::GR32_NOREX_NOSPRegClassID: + case X86::GR32_ADRegClassID: + case X86::GR64RegClassID: + case X86::GR64_ABCDRegClassID: + case X86::GR64_TCRegClassID: + case X86::GR64_TCW64RegClassID: + case X86::GR64_NOREXRegClassID: + case X86::GR64_NOSPRegClassID: + case X86::GR64_NOREX_NOSPRegClassID: + case X86::LOW32_ADDR_ACCESSRegClassID: + case X86::LOW32_ADDR_ACCESS_RBPRegClassID: + return true; + default: + return false; + } +} + +/// Check if \p RC is a vector register class. +/// I.e., FR* / VR* or one of their variant. +static bool isFRClass(const TargetRegisterClass &RC) { + switch (RC.getID()) { + case X86::FR32RegClassID: + case X86::FR32XRegClassID: + case X86::FR64RegClassID: + case X86::FR64XRegClassID: + case X86::FR128RegClassID: + case X86::VR64RegClassID: + case X86::VR128RegClassID: + case X86::VR128LRegClassID: + case X86::VR128HRegClassID: + case X86::VR128XRegClassID: + case X86::VR256RegClassID: + case X86::VR256LRegClassID: + case X86::VR256HRegClassID: + case X86::VR256XRegClassID: + case X86::VR512RegClassID: + return true; + default: + return false; + } +} + std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, @@ -28670,7 +31637,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. - if (Subtarget->is64Bit()) { + if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) @@ -28698,7 +31665,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); - if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS @@ -28706,7 +31673,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); - if (VT == MVT::i32 || !Subtarget->is64Bit()) + if (VT == MVT::i32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. @@ -28718,13 +31685,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. 
- if (!Subtarget->hasMMX()) break; + if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget->hasSSE2()) break; + if (!Subtarget.hasSSE2()) break; // FALL THROUGH. case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed - if (!Subtarget->hasSSE1()) break; + if (!Subtarget.hasSSE1()) break; switch (VT.SimpleTy) { default: break; @@ -28817,8 +31784,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; - if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || - Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { + // The generic code will match the first register class that contains the + // given register. Thus, based on the ordering of the tablegened file, + // the "plain" GR classes might not come first. + // Therefore, use a helper method. + if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); @@ -28834,11 +31804,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.first = 0; Res.second = nullptr; } - } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass || - Class == &X86::VR128RegClass || Class == &X86::VR256RegClass || - Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass || - Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass || - Class == &X86::VR512RegClass) { + } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -28907,7 +31873,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { - if (!Subtarget->is64Bit()) + if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. @@ -28919,12 +31885,12 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl<MachineBasicBlock *> &Exits) const { - const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { |
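// A hedged sketch (plain C++ with hypothetical names -- not LLVM's API) of the
// idea behind the isGRClass() remapping above: an inline-asm constraint may
// name one member of a GPR family (e.g. "{ax}") while the operand type asks
// for a different width, so the lowering switches to the sub- or
// super-register of the requested size, much as getX86SubSuperRegisterOrZero
// does for the real register numbers.
#include <string>

static std::string gprForSize(const std::string &Family16, unsigned Bits) {
  // Family16 is the 16-bit register name, e.g. "ax", "bx", "si", "di".
  bool HasByteForm = Family16.size() == 2 && Family16[1] == 'x';
  switch (Bits) {
  case 8:
    return HasByteForm ? std::string(1, Family16[0]) + "l" // al, bl, ...
                       : Family16 + "l";                   // sil, dil, ...
  case 16: return Family16;                                // ax, si, ...
  case 32: return "e" + Family16;                          // eax, esi, ...
  case 64: return "r" + Family16;                          // rax, rsi, ...
  default: return std::string();                           // no GPR of that size
  }
}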