Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 12158
1 file changed, 7562 insertions(+), 4596 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dd9966f9e1791..e547111959008 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -71,9 +71,10 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
- : TargetLowering(TM), Subtarget(&STI) {
- X86ScalarSSEf64 = Subtarget->hasSSE2();
- X86ScalarSSEf32 = Subtarget->hasSSE1();
+ : TargetLowering(TM), Subtarget(STI) {
+ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+ X86ScalarSSEf64 = Subtarget.hasSSE2();
+ X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the TargetLowering object.
@@ -86,24 +87,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
- if (Subtarget->isAtom())
+ if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
- else if (Subtarget->is64Bit())
+ else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
- if (Subtarget->hasSlowDivide32())
+ if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
- if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
+ if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 16);
}
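
Annotation: addBypassSlowDiv(32, 8) asks for each 32-bit divide to be guarded by a cheap width check (the actual rewrite happens on IR, in lib/Transforms/Utils/BypassSlowDivision.cpp). A minimal C++ sketch of the effect, illustrative only:

    #include <cstdint>

    // If both operands of a 32-bit unsigned divide fit in 8 bits, Atom's
    // much faster narrow DIV can be used instead of the full-width one.
    uint32_t bypassedDiv(uint32_t a, uint32_t b) {
      if (((a | b) & 0xFFFFFF00u) == 0)           // both fit in 8 bits?
        return uint32_t(uint8_t(a) / uint8_t(b)); // fast narrow divide
      return a / b;                               // slow full-width divide
    }
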
- if (Subtarget->isTargetKnownWindowsMSVC()) {
+ if (Subtarget.isTargetKnownWindowsMSVC()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
@@ -117,11 +118,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
- } else if (Subtarget->isTargetWindowsGNU()) {
+ } else if (Subtarget.isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but only plain longjmp (no underscore)!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
@@ -134,7 +135,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
@@ -164,14 +165,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
- if (Subtarget->is64Bit()) {
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
// f32/f64 are legal, f80 is custom.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
else
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
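
Annotation: the "64-bit FILD followed by conditional FADD" mentioned above is the standard unsigned-to-double trick; a scalar sketch of the math:

    #include <cstdint>

    // FILD converts a *signed* i64; when the top bit of x is set, the signed
    // value is x - 2^64, so a conditional add of 2^64 fixes up the result.
    double u64ToF64(uint64_t x) {
      double d = static_cast<double>(static_cast<int64_t>(x)); // FILD qword
      if (static_cast<int64_t>(x) < 0)                         // top bit set
        d += 18446744073709551616.0;                           // += 2^64
      return d;
    }
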
@@ -185,8 +186,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
- if (!Subtarget->useSoftFloat()) {
- // SSE has no i16 to fp conversion, only i32
+ if (!Subtarget.useSoftFloat()) {
+ // SSE has no i16 to fp conversion, only i32.
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
@@ -205,7 +206,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
- if (!Subtarget->useSoftFloat()) {
+ if (!Subtarget.useSoftFloat()) {
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
@@ -231,8 +232,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
- if (Subtarget->is64Bit()) {
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ if (Subtarget.is64Bit()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
@@ -240,9 +241,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (!Subtarget.useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
- if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
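
Annotation: the select-based FP_TO_UINT expansion rebases large inputs below 2^63 so the signed conversion instruction can be used; roughly:

    #include <cstdint>

    // Sketch of the expansion (cvttsd2si only produces signed results).
    uint64_t f64ToU64(double d) {
      const double Two63 = 9223372036854775808.0;               // 2^63
      if (d < Two63)                                            // the select
        return static_cast<uint64_t>(static_cast<int64_t>(d));
      uint64_t r = static_cast<int64_t>(d - Two63);             // now fits
      return r + (1ULL << 63);                                  // add bit back
    }
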
@@ -260,12 +261,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
- } else if (!Subtarget->is64Bit())
+ } else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
@@ -295,72 +296,43 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
- setOperationAction(ISD::BR_CC , MVT::f32, Expand);
- setOperationAction(ISD::BR_CC , MVT::f64, Expand);
- setOperationAction(ISD::BR_CC , MVT::f80, Expand);
- setOperationAction(ISD::BR_CC , MVT::f128, Expand);
- setOperationAction(ISD::BR_CC , MVT::i8, Expand);
- setOperationAction(ISD::BR_CC , MVT::i16, Expand);
- setOperationAction(ISD::BR_CC , MVT::i32, Expand);
- setOperationAction(ISD::BR_CC , MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::f128, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
- if (Subtarget->is64Bit())
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
+ MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ }
+ if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
- if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
- // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
- // is. We should promote the value to 64-bits to solve this.
- // This is what the CRT headers do - `fmodf` is an inline header
- // function casting to f64 and calling `fmod`.
- setOperationAction(ISD::FREM , MVT::f32 , Promote);
- } else {
- setOperationAction(ISD::FREM , MVT::f32 , Expand);
- }
-
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them up to i32, which has a shorter
// encoding.
- setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
- if (Subtarget->hasBMI()) {
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- } else {
+ setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
+ }
}
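
Annotation: setOperationPromotedToType is a convenience wrapper that collapses the old two-call pattern; conceptually (a member-function sketch simplified from TargetLoweringBase, not the verbatim definition):

    void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
      setOperationAction(Opc, OrigVT, Promote); // old call #1: mark Promote
      AddPromotedToType(Opc, OrigVT, DestVT);   // old call #2: record wider type
    }
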
- if (Subtarget->hasLZCNT()) {
+ if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
- setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
- AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
- AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
@@ -368,7 +340,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
@@ -377,7 +349,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
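
Annotation: with F16C these conversions map onto VCVTPH2PS/VCVTPS2PH; without it, the Expand action routes them through compiler-rt. Assuming the default RTLIB names (__gnu_h2f_ieee / __gnu_f2h_ieee), the lowering behaves like:

    // Half is passed around as its raw i16 bit pattern.
    extern "C" float __gnu_h2f_ieee(unsigned short);
    extern "C" unsigned short __gnu_f2h_ieee(float);

    float halfBitsToFloat(unsigned short h) { return __gnu_h2f_ieee(h); } // FP16_TO_FP
    unsigned short floatToHalfBits(float f) { return __gnu_f2h_ieee(f); } // FP_TO_FP16
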
@@ -395,45 +367,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
- if (Subtarget->hasPOPCNT()) {
+ if (Subtarget.hasPOPCNT()) {
setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
}
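
Annotation: without POPCNT the Expand action falls back to the classic SWAR bit-twiddling sequence; for i32 it is roughly equivalent to:

    #include <cstdint>

    uint32_t popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                 // 2-bit sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // 8-bit sums
      return (v * 0x01010101u) >> 24;                   // total in top byte
    }
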
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
- if (!Subtarget->hasMOVBE())
+ if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
- setOperationAction(ISD::SELECT , MVT::i8 , Custom);
- setOperationAction(ISD::SELECT , MVT::i16 , Custom);
- setOperationAction(ISD::SELECT , MVT::i32 , Custom);
- setOperationAction(ISD::SELECT , MVT::f32 , Custom);
- setOperationAction(ISD::SELECT , MVT::f64 , Custom);
- setOperationAction(ISD::SELECT , MVT::f80 , Custom);
- setOperationAction(ISD::SELECT , MVT::f128 , Custom);
- setOperationAction(ISD::SETCC , MVT::i8 , Custom);
- setOperationAction(ISD::SETCC , MVT::i16 , Custom);
- setOperationAction(ISD::SETCC , MVT::i32 , Custom);
- setOperationAction(ISD::SETCC , MVT::f32 , Custom);
- setOperationAction(ISD::SETCC , MVT::f64 , Custom);
- setOperationAction(ISD::SETCC , MVT::f80 , Custom);
- setOperationAction(ISD::SETCC , MVT::f128 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i8 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i16 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::SELECT , MVT::i64 , Custom);
- setOperationAction(ISD::SETCC , MVT::i64 , Custom);
- setOperationAction(ISD::SETCCE , MVT::i64 , Custom);
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SETCCE, VT, Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
@@ -444,34 +405,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// LLVM/Clang supports zero-cost DWARF exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
- setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
- setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
- setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
- setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
- if (Subtarget->is64Bit())
- setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
- setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
- setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
- setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
- setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
- setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
- setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::ConstantPool , VT, Custom);
+ setOperationAction(ISD::JumpTable , VT, Custom);
+ setOperationAction(ISD::GlobalAddress , VT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
+ setOperationAction(ISD::ExternalSymbol , VT, Custom);
+ setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
- setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
- setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
- setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
- setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
- setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ for (auto VT : { MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::SHL_PARTS, VT, Custom);
+ setOperationAction(ISD::SRA_PARTS, VT, Custom);
+ setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
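
Annotation: the *_PARTS nodes model double-width shifts on 32-bit targets, where an i64 value lives in a lo/hi register pair (ultimately SHLD/SHRD plus a test for counts >= 32). A minimal sketch for SHL_PARTS, assuming Amt < 64:

    #include <cstdint>

    uint64_t shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt) {
      uint32_t NewLo, NewHi;
      if (Amt == 0) {
        NewLo = Lo; NewHi = Hi;
      } else if (Amt < 32) {
        NewHi = (Hi << Amt) | (Lo >> (32 - Amt)); // SHLD-style combine
        NewLo = Lo << Amt;
      } else {
        NewHi = Lo << (Amt - 32);                 // whole low word moves up
        NewLo = 0;
      }
      return (uint64_t(NewHi) << 32) | NewLo;
    }
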
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
@@ -480,16 +438,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
- if (Subtarget->hasCmpxchg16b()) {
+ if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
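
Annotation: a plausible reading of the newly Custom ATOMIC_LOAD_ADD/OR/XOR/AND actions (hedged; the lowering code itself is outside this hunk): an atomicrmw whose result is unused can be emitted as a single LOCK-prefixed ALU instruction instead of the heavier XADD/CMPXCHG form:

    // __atomic_fetch_add(&counter, 1, __ATOMIC_SEQ_CST);  // result ignored
    //   can become:   lock addl $1, counter
    //   rather than:  movl $1, %eax ; lock xaddl %eax, counter
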
// FIXME - use subtarget debug flags
- if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
- !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
+ if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
+ !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
+ TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
@@ -505,14 +468,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::VAARG , MVT::Other, Custom);
- setOperationAction(ISD::VACOPY , MVT::Other, Custom);
- } else {
- // TargetInfo::CharPtrBuiltinVaList
- setOperationAction(ISD::VAARG , MVT::Other, Expand);
- setOperationAction(ISD::VACOPY , MVT::Other, Expand);
- }
+ bool Is64Bit = Subtarget.is64Bit();
+ setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -523,41 +481,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
- if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
+ if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
addRegisterClass(MVT::f64, &X86::FR64RegClass);
- // Use ANDPD to simulate FABS.
- setOperationAction(ISD::FABS , MVT::f64, Custom);
- setOperationAction(ISD::FABS , MVT::f32, Custom);
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS, VT, Custom);
- // Use XORP to simulate FNEG.
- setOperationAction(ISD::FNEG , MVT::f64, Custom);
- setOperationAction(ISD::FNEG , MVT::f32, Custom);
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG, VT, Custom);
- // Use ANDPD and ORPD to simulate FCOPYSIGN.
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
- // Lower this to FGETSIGNx86 plus an AND.
+ // Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- // We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-
// Expand FP immediates into loads from the stack, except for the special
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
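
Annotation: the ANDPD/XORPD "simulation" in the comments above is plain sign-bit masking; scalar equivalents (memcpy stands in for the vector bitcasts):

    #include <cstdint>
    #include <cstring>

    double fabsViaMask(double x) {            // ANDPD: clear the sign bit
      uint64_t b; std::memcpy(&b, &x, 8);
      b &= ~(1ULL << 63);
      std::memcpy(&x, &b, 8);
      return x;
    }

    double fnegViaMask(double x) {            // XORPD: flip the sign bit
      uint64_t b; std::memcpy(&b, &x, 8);
      b ^= 1ULL << 63;
      std::memcpy(&x, &b, 8);
      return x;
    }

FCOPYSIGN composes the same masks, (mag & ~signbit) | (sign & signbit), hence ANDPD plus ORPD.
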
- } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
+ } else if (UseX87 && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -592,24 +546,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
- } else if (!Subtarget->useSoftFloat()) {
+ } else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
- setOperationAction(ISD::UNDEF, MVT::f64, Expand);
- setOperationAction(ISD::UNDEF, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ for (auto VT : { MVT::f32, MVT::f64 }) {
+ setOperationAction(ISD::UNDEF, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ }
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
@@ -626,8 +577,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87, except f128 in MMX.
- if (!Subtarget->useSoftFloat()) {
- if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+ if (UseX87) {
+ if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, &X86::FR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
@@ -680,38 +631,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+ // Some FP actions are always expanded for vector types.
+ for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
+ MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ }
+
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
- setOperationAction(ISD::ADD , VT, Expand);
- setOperationAction(ISD::SUB , VT, Expand);
- setOperationAction(ISD::FADD, VT, Expand);
- setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::FSUB, VT, Expand);
- setOperationAction(ISD::MUL , VT, Expand);
- setOperationAction(ISD::FMUL, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
- setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
- setOperationAction(ISD::LOAD, VT, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
- setOperationAction(ISD::FABS, VT, Expand);
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FMA, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
- setOperationAction(ISD::FSQRT, VT, Expand);
- setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -723,24 +672,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
- setOperationAction(ISD::SHL, VT, Expand);
- setOperationAction(ISD::SRA, VT, Expand);
- setOperationAction(ISD::SRL, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
- setOperationAction(ISD::FLOG, VT, Expand);
- setOperationAction(ISD::FLOG2, VT, Expand);
- setOperationAction(ISD::FLOG10, VT, Expand);
- setOperationAction(ISD::FEXP, VT, Expand);
- setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
@@ -750,7 +688,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
- setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
@@ -774,35 +711,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
- if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
- // MMX-sized vectors (other than x86mmx) are expected to be expanded
- // into smaller operations.
- for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
- setOperationAction(ISD::MULHS, MMXTy, Expand);
- setOperationAction(ISD::AND, MMXTy, Expand);
- setOperationAction(ISD::OR, MMXTy, Expand);
- setOperationAction(ISD::XOR, MMXTy, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
- setOperationAction(ISD::SELECT, MMXTy, Expand);
- setOperationAction(ISD::BITCAST, MMXTy, Expand);
- }
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
-
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
- setOperationAction(ISD::FADD, MVT::v4f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -811,7 +729,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
@@ -821,27 +739,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
- setOperationAction(ISD::ADD, MVT::v16i8, Legal);
- setOperationAction(ISD::ADD, MVT::v8i16, Legal);
- setOperationAction(ISD::ADD, MVT::v4i32, Legal);
- setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
- setOperationAction(ISD::SUB, MVT::v16i8, Legal);
- setOperationAction(ISD::SUB, MVT::v8i16, Legal);
- setOperationAction(ISD::SUB, MVT::v4i32, Legal);
- setOperationAction(ISD::SUB, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
- setOperationAction(ISD::FADD, MVT::v2f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
@@ -870,10 +777,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
// ISD::CTTZ v2i64 - scalarization is faster.
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
- // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -899,37 +802,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
}
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+ for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+
+ if (VT == MVT::v2i64 && !Subtarget.is64Bit())
+ continue;
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v2i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v2i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
}
// Custom lower v2i64 and v2f64 selects.
- setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
@@ -942,7 +836,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
// As there is no 64-bit GPR available, we need to build a special custom
// sequence to convert from v2i32 to v2f32.
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
@@ -954,9 +848,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+ for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ // ISD::CTLZ v4i32 - scalarization is faster.
+ // ISD::CTLZ v2i64 - scalarization is faster.
}
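
Annotation: the Custom v16i8 BITREVERSE here is presumably the usual SSSE3 trick: split each byte into nibbles and use PSHUFB as a 16-entry reverse lookup table. Scalar sketch of the same idea:

    #include <cstdint>

    uint8_t bitrev8(uint8_t b) {
      static const uint8_t RevNib[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
                                         0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
      return uint8_t(RevNib[b & 0xF] << 4 | RevNib[b >> 4]); // swap + reverse nibbles
    }
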
- if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
@@ -1004,66 +924,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
- // i8 and i16 vectors are custom because the source register and source
- // source memory operand types are not the same width. f32 vectors are
- // custom since the immediate controlling the insert encodes additional
- // information.
+ // i8 vectors are custom because the source register and
+ // source memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
-
- // FIXME: these should be Legal, but that's only for the case where
- // the index is constant. For now custom expand to deal with that.
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
- }
}
- if (Subtarget->hasSSE2()) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
-
- setOperationAction(ISD::SRL, MVT::v8i16, Custom);
- setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+ if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::SHL, MVT::v8i16, Custom);
- setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
- setOperationAction(ISD::SRA, MVT::v8i16, Custom);
- setOperationAction(ISD::SRA, MVT::v16i8, Custom);
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- setOperationAction(ISD::SRL, MVT::v2i64, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+ // XOP can efficiently perform BITREVERSE with VPPERM.
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
- setOperationAction(ISD::SHL, MVT::v2i64, Custom);
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SRA, MVT::v2i64, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
}
- if (Subtarget->hasXOP()) {
- setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
- setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
- setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
- setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
- setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
- setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
- setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
- setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
- }
+ if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ bool HasInt256 = Subtarget.hasInt256();
- if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
@@ -1071,35 +953,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
- setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
- setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
-
- setOperationAction(ISD::FADD, MVT::v8f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
- setOperationAction(ISD::FABS, MVT::v8f32, Custom);
-
- setOperationAction(ISD::FADD, MVT::v4f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
- setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
- setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+ for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ }
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
@@ -1117,14 +979,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
- setOperationAction(ISD::SRL, MVT::v16i16, Custom);
- setOperationAction(ISD::SRL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SHL, MVT::v16i16, Custom);
- setOperationAction(ISD::SHL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SRA, MVT::v16i16, Custom);
- setOperationAction(ISD::SRA, MVT::v32i8, Custom);
+ for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
@@ -1147,63 +1006,57 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
+
+ // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
+ // as we end up splitting the 256-bit vectors.
+ for (auto VT : { MVT::v32i8, MVT::v16i16 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
+
+ if (HasInt256)
+ for (auto VT : { MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::CTPOP, MVT::v32i8, Custom);
- setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
-
- setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
- setOperationAction(ISD::CTTZ, MVT::v16i16, Custom);
- setOperationAction(ISD::CTTZ, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
-
- if (Subtarget->hasAnyFMA()) {
- setOperationAction(ISD::FMA, MVT::v8f32, Legal);
- setOperationAction(ISD::FMA, MVT::v4f64, Legal);
- setOperationAction(ISD::FMA, MVT::v4f32, Legal);
- setOperationAction(ISD::FMA, MVT::v2f64, Legal);
- setOperationAction(ISD::FMA, MVT::f32, Legal);
- setOperationAction(ISD::FMA, MVT::f64, Legal);
- }
-
- if (Subtarget->hasInt256()) {
- setOperationAction(ISD::ADD, MVT::v4i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i32, Legal);
- setOperationAction(ISD::ADD, MVT::v16i16, Legal);
- setOperationAction(ISD::ADD, MVT::v32i8, Legal);
-
- setOperationAction(ISD::SUB, MVT::v4i64, Legal);
- setOperationAction(ISD::SUB, MVT::v8i32, Legal);
- setOperationAction(ISD::SUB, MVT::v16i16, Legal);
- setOperationAction(ISD::SUB, MVT::v32i8, Legal);
-
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Legal);
- setOperationAction(ISD::MUL, MVT::v16i16, Legal);
- setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
-
- setOperationAction(ISD::SMAX, MVT::v32i8, Legal);
- setOperationAction(ISD::SMAX, MVT::v16i16, Legal);
- setOperationAction(ISD::SMAX, MVT::v8i32, Legal);
- setOperationAction(ISD::UMAX, MVT::v32i8, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v8i32, Legal);
- setOperationAction(ISD::SMIN, MVT::v32i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v16i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i32, Legal);
- setOperationAction(ISD::UMIN, MVT::v32i8, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v8i32, Legal);
+ if (Subtarget.hasAnyFMA()) {
+ for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
+ MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::FMA, VT, Legal);
+ }
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
+ }
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+ }
+
+ if (HasInt256) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
@@ -1223,62 +1076,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
- } else {
- setOperationAction(ISD::ADD, MVT::v4i64, Custom);
- setOperationAction(ISD::ADD, MVT::v8i32, Custom);
- setOperationAction(ISD::ADD, MVT::v16i16, Custom);
- setOperationAction(ISD::ADD, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SUB, MVT::v4i64, Custom);
- setOperationAction(ISD::SUB, MVT::v8i32, Custom);
- setOperationAction(ISD::SUB, MVT::v16i16, Custom);
- setOperationAction(ISD::SUB, MVT::v32i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Custom);
- setOperationAction(ISD::MUL, MVT::v16i16, Custom);
- setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v32i8, Custom);
- setOperationAction(ISD::SMAX, MVT::v16i16, Custom);
- setOperationAction(ISD::SMAX, MVT::v8i32, Custom);
- setOperationAction(ISD::UMAX, MVT::v32i8, Custom);
- setOperationAction(ISD::UMAX, MVT::v16i16, Custom);
- setOperationAction(ISD::UMAX, MVT::v8i32, Custom);
- setOperationAction(ISD::SMIN, MVT::v32i8, Custom);
- setOperationAction(ISD::SMIN, MVT::v16i16, Custom);
- setOperationAction(ISD::SMIN, MVT::v8i32, Custom);
- setOperationAction(ISD::UMIN, MVT::v32i8, Custom);
- setOperationAction(ISD::UMIN, MVT::v16i16, Custom);
- setOperationAction(ISD::UMIN, MVT::v8i32, Custom);
}
// In the customized shift lowering, the legal cases in AVX2 will be
// recognized.
- setOperationAction(ISD::SRL, MVT::v4i64, Custom);
- setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ }
- setOperationAction(ISD::SHL, MVT::v4i64, Custom);
- setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
- setOperationAction(ISD::SRA, MVT::v4i64, Custom);
- setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v4f32, MVT::v2f64 }) {
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ }
// Custom lower several nodes for 256-bit types.
- for (MVT VT : MVT::vector_valuetypes()) {
- if (VT.getScalarSizeInBits() >= 32) {
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- }
- // Extract subvector is special because the value type
- // (result) is 128-bit but the source is 256-bit wide.
- if (VT.is128BitVector()) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- }
- // Do not attempt to custom lower other non-256-bit vectors
- if (!VT.is256BitVector())
- continue;
-
+ for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1289,25 +1112,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
- if (Subtarget->hasInt256())
+ if (HasInt256)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v4i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v4i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
}
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
@@ -1320,19 +1138,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
-
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ }
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::SETCCE, MVT::i1, Custom);
@@ -1343,29 +1156,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::i1, Custom);
setOperationAction(ISD::ADD, MVT::i1, Custom);
setOperationAction(ISD::MUL, MVT::i1, Custom);
- setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
- setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
- setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
- setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
-
- setOperationAction(ISD::FADD, MVT::v16f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
- setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
- setOperationAction(ISD::FABS, MVT::v16f32, Custom);
-
- setOperationAction(ISD::FADD, MVT::v8f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
- setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
- setOperationAction(ISD::FABS, MVT::v8f64, Custom);
- setOperationAction(ISD::FMA, MVT::v8f64, Legal);
- setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+
+ for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
+ MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
+ MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
+ setTruncStoreAction(VT, MaskVT, Custom);
+ }
+
+ for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
+ }
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
@@ -1389,7 +1195,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
- if (Subtarget->hasVLX()){
+ if (Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
@@ -1412,15 +1218,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
- if (Subtarget->hasDQI()) {
- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
-
+ setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
+ if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
@@ -1431,7 +1236,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
}
}
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1440,7 +1245,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+
+      // FIXME: These instructions are also available on SSE/AVX2; add the
+      // relevant patterns.
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
}
+
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
@@ -1453,20 +1273,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget->hasDQI()) {
+ if (Subtarget.hasDQI()) {
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
}
- setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
+ for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ }
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
@@ -1501,139 +1318,115 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v16i32, Legal);
-
- setOperationAction(ISD::SUB, MVT::v8i64, Legal);
- setOperationAction(ISD::SUB, MVT::v16i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v16i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v8i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v16i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v8i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v16i1, Expand);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
- setOperationAction(ISD::SRL, MVT::v8i64, Custom);
- setOperationAction(ISD::SRL, MVT::v16i32, Custom);
-
- setOperationAction(ISD::SHL, MVT::v8i64, Custom);
- setOperationAction(ISD::SHL, MVT::v16i32, Custom);
-
- setOperationAction(ISD::SRA, MVT::v8i64, Custom);
- setOperationAction(ISD::SRA, MVT::v16i32, Custom);
-
- setOperationAction(ISD::AND, MVT::v8i64, Legal);
- setOperationAction(ISD::OR, MVT::v8i64, Legal);
- setOperationAction(ISD::XOR, MVT::v8i64, Legal);
- setOperationAction(ISD::AND, MVT::v16i32, Legal);
- setOperationAction(ISD::OR, MVT::v16i32, Legal);
- setOperationAction(ISD::XOR, MVT::v16i32, Legal);
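+    // Logic ops on 512-bit integers are legal; shifts, CTPOP and CTTZ are
+    // custom-lowered.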
+ for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
- if (Subtarget->hasCDI()) {
+ if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
- if (Subtarget->hasVLX()) {
+ if (Subtarget.hasVLX()) {
setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
} else {
setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
}
- } // Subtarget->hasCDI()
- if (Subtarget->hasDQI()) {
- setOperationAction(ISD::MUL, MVT::v2i64, Legal);
- setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasDQI()) {
+ if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ }
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
// Custom lower several nodes.
- for (MVT VT : MVT::vector_valuetypes()) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (EltSize == 1) {
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
- }
- if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
- }
- // Extract subvector is special because the value type
- // (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector()) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- }
- if (VT.getVectorElementType() == MVT::i1)
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
-
- // Do not attempt to custom lower other non-512-bit vectors
- if (!VT.is512BitVector())
- continue;
-
- if (EltSize >= 32) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::MGATHER, VT, Legal);
- setOperationAction(ISD::MSCATTER, VT, Custom);
- }
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+ // Extract subvector is special because the value type
+ // (result) is 256-bit but the source is 512-bit wide.
+    // The 128-bit case was already made Custom under AVX1.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
+ MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
}// has AVX-512
- if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
- setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v64i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v32i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v64i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v32i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
- setOperationAction(ISD::ADD, MVT::v32i16, Legal);
- setOperationAction(ISD::ADD, MVT::v64i8, Legal);
- setOperationAction(ISD::SUB, MVT::v32i16, Legal);
- setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
@@ -1646,12 +1439,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
@@ -1667,6 +1463,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
@@ -1679,36 +1480,59 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
- if (Subtarget->hasVLX())
+ if (Subtarget.hasVLX())
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- if (Subtarget->hasCDI()) {
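+    // Masked load/store for 128/256-bit i8/i16 vectors is only legal with
+    // VLX; otherwise it is custom-lowered.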
+ LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
+ for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Action);
+ setOperationAction(ISD::MSTORE, VT, Action);
+ }
+
+ if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand);
}
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
-
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v8i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v8i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+
+ setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
+ }
+
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+ if (Subtarget.hasVLX()) {
+        // FIXME: These instructions are also available on SSE/AVX2; add the
+        // relevant patterns.
+ setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
+ }
}
}
- if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ setOperationAction(ISD::ADD, MVT::v2i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v4i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i1, Expand);
+ setOperationAction(ISD::SUB, MVT::v4i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i1, Expand);
+ setOperationAction(ISD::MUL, MVT::v4i1, Expand);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
@@ -1721,31 +1545,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ }
- setOperationAction(ISD::AND, MVT::v8i32, Legal);
- setOperationAction(ISD::OR, MVT::v8i32, Legal);
- setOperationAction(ISD::XOR, MVT::v8i32, Legal);
- setOperationAction(ISD::AND, MVT::v4i32, Legal);
- setOperationAction(ISD::OR, MVT::v4i32, Legal);
- setOperationAction(ISD::XOR, MVT::v4i32, Legal);
- setOperationAction(ISD::SRA, MVT::v2i64, Custom);
- setOperationAction(ISD::SRA, MVT::v4i64, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
- setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ }
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- if (!Subtarget->is64Bit()) {
+ if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
@@ -1757,7 +1578,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
- if (VT == MVT::i64 && !Subtarget->is64Bit())
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
@@ -1768,7 +1589,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, VT, Custom);
}
- if (!Subtarget->is64Bit()) {
+ if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
@@ -1776,10 +1597,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into one node or libcall if possible.
- if (Subtarget->hasSinCos()) {
+ if (Subtarget.hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// For MacOSX, we don't want the normal expansion of a libcall to sincos.
// We want to issue a libcall to __sincos_stret to avoid memory traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
@@ -1787,7 +1608,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (Subtarget->isTargetWin64()) {
+ if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
@@ -1796,6 +1617,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
+  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is.
+  // Promote the value to 64 bits to solve this. This is what the CRT
+  // headers do - `fmodf` is an inline header function that casts to f64
+  // and calls `fmod`.
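+  // (Roughly: float fmodf(float x, float y) { return (float)fmod(x, y); })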
+ if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
+ for (ISD::NodeType Op :
+ {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
+ ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1827,13 +1659,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
- computeRegisterProperties(Subtarget->getRegisterInfo());
+ computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
@@ -1843,9 +1674,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemmoveOptSize = 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- // A predictable cmov does not hurt on an in-order CPU.
- // FIXME: Use a CPU attribute to trigger this, not a CPU model.
- PredictableSelectIsExpensive = !Subtarget->isAtom();
+ // An out-of-order CPU can speculatively execute past a predictable branch,
+ // but a conditional move could be stalled by an expensive earlier operation.
+ PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
@@ -1854,7 +1685,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
- return Subtarget->isTargetMachO() && Subtarget->is64Bit();
+ return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
TargetLoweringBase::LegalizeTypeAction
@@ -1867,24 +1698,25 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
-EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
+    return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
const unsigned NumElts = VVT.getVectorNumElements();
- const MVT EltVT = VVT.getVectorElementType();
+ MVT EltVT = VVT.getVectorElementType();
if (VVT.is512BitVector()) {
- if (Subtarget->hasAVX512())
+ if (Subtarget.hasAVX512())
if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
EltVT == MVT::f32 || EltVT == MVT::f64)
switch(NumElts) {
case 8: return MVT::v8i1;
case 16: return MVT::v16i1;
}
- if (Subtarget->hasBWI())
+ if (Subtarget.hasBWI())
if (EltVT == MVT::i8 || EltVT == MVT::i16)
switch(NumElts) {
case 32: return MVT::v32i1;
@@ -1892,23 +1724,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
}
}
- if (VVT.is256BitVector() || VVT.is128BitVector()) {
- if (Subtarget->hasVLX())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 2: return MVT::v2i1;
- case 4: return MVT::v4i1;
- case 8: return MVT::v8i1;
- }
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- case 32: return MVT::v32i1;
- }
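+    // With BWI and VLX, a same-width vXi1 mask type exists for every legal
+    // vector type.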
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
+ return MVT::getVectorVT(MVT::i1, NumElts);
+
+ if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
+ EVT LegalVT = getTypeToTransformTo(Context, VT);
+ EltVT = LegalVT.getVectorElementType().getSimpleVT();
}
+
+ if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
+ switch(NumElts) {
+ case 2: return MVT::v2i1;
+ case 4: return MVT::v4i1;
+ case 8: return MVT::v8i1;
+ }
}
return VT.changeVectorElementTypeToInteger();
@@ -1945,7 +1774,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
@@ -1954,7 +1783,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
}
unsigned Align = 4;
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
@@ -1977,35 +1806,40 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
- if ((!IsMemset || ZeroMemset) &&
- !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
- (!Subtarget->isUnalignedMem16Slow() ||
+ (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
- if (Size >= 32) {
- // FIXME: Check if unaligned 32-byte accesses are slow.
- if (Subtarget->hasInt256())
- return MVT::v8i32;
- if (Subtarget->hasFp256())
- return MVT::v8f32;
+ // FIXME: Check if unaligned 32-byte accesses are slow.
+ if (Size >= 32 && Subtarget.hasAVX()) {
+ // Although this isn't a well-supported type for AVX1, we'll let
+ // legalization and shuffle lowering produce the optimal codegen. If we
+ // choose an optimal type with a vector element larger than a byte,
+ // getMemsetStores() may create an intermediate splat (using an integer
+ // multiply) before we splat as a vector.
+ return MVT::v32i8;
}
- if (Subtarget->hasSSE2())
- return MVT::v4i32;
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE2())
+ return MVT::v16i8;
+ // TODO: Can SSE1 handle a byte vector?
+ if (Subtarget.hasSSE1())
return MVT::v4f32;
- } else if (!MemcpyStrSrc && Size >= 8 &&
- !Subtarget->is64Bit() &&
- Subtarget->hasSSE2()) {
+ } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
+ !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
+ // Also, do not use f64 to lower memset unless this is a memset of zeros.
+ // The gymnastics of splatting a byte value into an XMM register and then
+ // only using 8-byte stores (because this is a CPU with slow unaligned
+ // 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
- if (Subtarget->is64Bit() && Size >= 8)
+ if (Subtarget.is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
}
@@ -2030,10 +1864,10 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
*Fast = true;
break;
case 128:
- *Fast = !Subtarget->isUnalignedMem16Slow();
+ *Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
- *Fast = !Subtarget->isUnalignedMem32Slow();
+ *Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
@@ -2048,8 +1882,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT())
+ if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
@@ -2057,15 +1890,14 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
}
bool X86TargetLowering::useSoftFloat() const {
- return Subtarget->useSoftFloat();
+ return Subtarget.useSoftFloat();
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
- assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT());
+ assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
@@ -2075,7 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
@@ -2089,7 +1921,7 @@ const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
- if (Subtarget->isPICStyleRIPRel())
+ if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
@@ -2105,7 +1937,7 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
- RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+ RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
@@ -2121,47 +1953,76 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
return std::make_pair(RRC, Cost);
}
-bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
- unsigned &Offset) const {
- if (!Subtarget->isTargetLinux())
- return false;
+unsigned X86TargetLowering::getAddressSpace() const {
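+  // In X86 address-space numbering, 256 is %gs and 257 is %fs.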
+ if (Subtarget.is64Bit())
+ return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
+ return 256;
+}
- if (Subtarget->is64Bit()) {
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- Offset = 0x28;
- if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
- AddressSpace = 256;
- else
- AddressSpace = 257;
- } else {
- // %gs:0x14 on i386
- Offset = 0x14;
- AddressSpace = 256;
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  // glibc has a special slot for the stack guard in tcbhead_t; use it instead
+  // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h).
+ if (!Subtarget.isTargetGlibc())
+ return TargetLowering::getIRStackGuard(IRB);
+
+  // %fs:0x28 on x86-64 (or %gs:0x28 with the Kernel code model);
+  // %gs:0x14 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ unsigned AddressSpace = getAddressSpace();
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+void X86TargetLowering::insertSSPDeclarations(Module &M) const {
+ // MSVC CRT provides functionalities for stack protection.
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal("__security_cookie",
+ Type::getInt8PtrTy(M.getContext()));
+
+ // MSVC CRT has a function to validate security cookie.
+ auto *SecurityCheckCookie = cast<Function>(
+ M.getOrInsertFunction("__security_check_cookie",
+ Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()), nullptr));
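+    // The cookie is passed in ECX, hence X86_FastCall plus the InReg
+    // attribute below.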
+ SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
+ SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ return;
}
- return true;
+ // glibc has a special slot for the stack guard.
+ if (Subtarget.isTargetGlibc())
+ return;
+ TargetLowering::insertSSPDeclarations(M);
+}
+
+Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
+ // MSVC CRT has a global variable holding security cookie.
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
+ return M.getGlobalVariable("__security_cookie");
+ return TargetLowering::getSDagStackGuard(M);
+}
+
+Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
+ return M.getFunction("__security_check_cookie");
+ return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
+ if (!Subtarget.isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
unsigned AddressSpace, Offset;
- if (Subtarget->is64Bit()) {
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- Offset = 0x48;
- if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
- AddressSpace = 256;
- else
- AddressSpace = 257;
- } else {
- // %gs:0x24 on i386
- Offset = 0x24;
- AddressSpace = 256;
- }
+  // %fs:0x48 on x86-64 (or %gs:0x48 with the Kernel code model);
+  // %gs:0x24 on i386.
+ Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ AddressSpace = getAddressSpace();
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
@@ -2194,11 +2055,11 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
}
SDValue
-X86TargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
@@ -2214,10 +2075,10 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
- MVT::i16));
+ MVT::i32));
// Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue ValToCopy = OutVals[i];
@@ -2244,14 +2105,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
if (ValVT == MVT::f64 &&
- (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -2269,7 +2130,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
@@ -2277,7 +2138,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
@@ -2288,6 +2149,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+  // The Swift calling convention does not require us to copy the sret argument
+  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
@@ -2298,11 +2162,30 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
- SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ // When we have both sret and another return value, we should use the
+ // original Chain stored in RetOps[0], instead of the current Chain updated
+ // in the above loop. If we only have sret, RetOps[0] equals to Chain.
+
+ // For the case of sret and another return value, we have
+ // Chain_0 at the function entry
+ // Chain_1 = getCopyToReg(Chain_0) in the above loop
+ // If we use Chain_1 in getCopyFromReg, we will have
+ // Val = getCopyFromReg(Chain_1)
+ // Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+ // getCopyToReg(Chain_0) will be glued together with
+ // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+ // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
+ // Data dependency from Unit B to Unit A due to usage of Val in
+ // getCopyToReg(Chain_1, Val)
+ // Chain dependency from Unit A to Unit B
+
+ // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
+ SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
- = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+ = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
@@ -2312,7 +2195,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
- const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -2337,9 +2220,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
- if (N->getNumValues() != 1)
- return false;
- if (!N->hasNUsesOfValue(1, 0))
+ if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
@@ -2375,15 +2256,19 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
return true;
}
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
- ISD::NodeType ExtendKind) const {
- MVT ReturnMVT;
- // TODO: Is this also valid on 32-bit?
- if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ MVT ReturnMVT = MVT::i32;
+
+ bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
+ if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
+ // The ABI does not require i1, i8 or i16 to be extended.
+ //
+ // On Darwin, there is code in the wild relying on Clang's old behaviour of
+ // always extending i8/i16 return values, so keep doing that for now.
+ // (PR26665).
ReturnMVT = MVT::i8;
- else
- ReturnMVT = MVT::i32;
+ }
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
@@ -2392,16 +2277,14 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
-SDValue
-X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue X86TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool Is64Bit = Subtarget->is64Bit();
+ bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -2413,7 +2296,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
@@ -2422,6 +2305,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
+ if (!Subtarget.hasX87())
+ report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
@@ -2492,10 +2377,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
-static SDValue
-CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
- ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+ SDValue Chain, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -2549,13 +2433,11 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
}
SDValue
-X86TargetLowering::LowerMemArgument(SDValue Chain,
- CallingConv::ID CallConv,
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
+ const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
- MachineFrameInfo *MFI,
- unsigned i) const {
+ MachineFrameInfo *MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
@@ -2602,6 +2484,14 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
+
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI->setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI->setObjectSExt(FI, true);
+ }
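+      // Recording the extension kind lets later loads of this argument slot
+      // know that the upper bits are already zero- or sign-extended.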
+
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI->setObjectOffset(FI, Offset);
@@ -2610,8 +2500,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem ?
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
}
@@ -2619,10 +2508,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
- const X86Subtarget *Subtarget) {
- assert(Subtarget->is64Bit());
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
- if (Subtarget->isCallingConvWin64(CallConv)) {
+ if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
@@ -2638,9 +2527,9 @@ static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
- const X86Subtarget *Subtarget) {
- assert(Subtarget->is64Bit());
- if (Subtarget->isCallingConvWin64(CallConv)) {
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.is64Bit());
+ if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
@@ -2650,10 +2539,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
const Function *Fn = MF.getFunction();
bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
- bool isSoftFloat = Subtarget->useSoftFloat();
+ bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
- if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+ if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
@@ -2667,21 +2556,21 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Function* Fn = MF.getFunction();
+ const Function *Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
- Subtarget->isTargetCygMing() &&
+ Subtarget.isTargetCygMing() &&
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo *MFI = MF.getFrameInfo();
- bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
@@ -2778,13 +2667,18 @@ SDValue X86TargetLowering::LowerFormalArguments(
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect)
- ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
- MachinePointerInfo(), false, false, false, 0);
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // The Swift calling convention does not require us to copy the sret
+    // argument into %rax/%eax for the return, so SRetReturnReg is not set for
+    // Swift.
+ if (CallConv == CallingConv::Swift)
+ continue;
+
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
@@ -2819,7 +2713,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
// Figure out if XMM registers are in use.
- assert(!(Subtarget->useSoftFloat() &&
+ assert(!(Subtarget.useSoftFloat() &&
Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
@@ -2831,7 +2725,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
- assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
@@ -2865,7 +2759,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
- // they may be loaded by deferencing the result of va_next.
+ // they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
@@ -2884,8 +2778,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
+ FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
@@ -2913,13 +2806,13 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget->hasAVX512() &&
+ if (Subtarget.hasAVX512() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
- else if (Subtarget->hasAVX())
+ else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
- else if (Subtarget->hasSSE2())
+ else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
@@ -2960,8 +2853,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget->getTargetTriple().isOSMSVCRT() &&
- argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn)
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
+ argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2987,7 +2880,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
- // space to accomodate holding this slot at the correct offset).
+ // space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
@@ -2996,12 +2889,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
return Chain;
}
-SDValue
-X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
- SDValue StackPtr, SDValue Arg,
- SDLoc dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+ SDValue Arg, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
@@ -3011,24 +2903,20 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
return DAG.getStore(
Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
- false, false, 0);
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
-SDValue
-X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
- SDValue &OutRetAddr, SDValue Chain,
- bool IsTailCall, bool Is64Bit,
- int FPDiff, SDLoc dl) const {
+SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
+ SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
+ bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
- OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
- false, false, false, 0);
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
return SDValue(OutRetAddr.getNode(), 1);
}
@@ -3037,7 +2925,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
- int FPDiff, SDLoc dl) {
+ int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -3047,21 +2935,20 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), NewReturnAddrFI),
- false, false, 0);
+ DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
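  // E.g. for v4f32 this builds the mask <4,1,2,3>: lane 0 is taken from V2,
  // lanes 1-3 from V1.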
SmallVector<int, 8> Mask;
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
@@ -3079,9 +2966,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU());
+ bool Is64Bit = Subtarget.is64Bit();
+ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
@@ -3092,7 +2979,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget->isPICStyleGOT() &&
+ if (Subtarget.isPICStyleGOT() &&
!MF.getTarget().Options.GuaranteedTailCallOpt) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
@@ -3195,7 +3082,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -3238,8 +3125,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
break;
}
@@ -3273,7 +3159,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
- if (Subtarget->isPICStyleGOT()) {
+ if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls via the PLT.
if (!isTailCall) {
@@ -3314,7 +3200,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
- assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
@@ -3377,8 +3263,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- false, false, 0));
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
@@ -3416,70 +3301,29 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// non-JIT mode.
const GlobalValue *GV = G->getGlobal();
if (!GV->hasDLLImportStorageClass()) {
- unsigned char OpFlags = 0;
- bool ExtraLoad = false;
- unsigned WrapperKind = ISD::DELETED_NODE;
-
- // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
- // external symbols most go through the PLT in PIC mode. If the symbol
- // has hidden or protected visibility, or if it is static or local, then
- // we don't need to use the PLT - we can directly call it.
- if (Subtarget->isTargetELF() &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
- GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- !GV->isStrongDefinitionForLinker() &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
- cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
- // If the function is marked as non-lazy, generate an indirect call
- // which loads from the GOT directly. This avoids runtime overhead
- // at the cost of eager binding (and one extra byte of encoding).
- OpFlags = X86II::MO_GOTPCREL;
- WrapperKind = X86ISD::WrapperRIP;
- ExtraLoad = true;
- }
+ unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
Callee = DAG.getTargetGlobalAddress(
GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
- // Add a wrapper if needed.
- if (WrapperKind != ISD::DELETED_NODE)
+ if (OpFlags == X86II::MO_GOTPCREL) {
+ // Add a wrapper.
Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- // Add extra indirection if needed.
- if (ExtraLoad)
+ getPointerTy(DAG.getDataLayout()), Callee);
+      // Add extra indirection.
Callee = DAG.getLoad(
getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
- false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- unsigned char OpFlags = 0;
-
- // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
- // external symbols should go through the PLT.
- if (Subtarget->isTargetELF() &&
- DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- }
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ unsigned char OpFlags =
+ Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
Callee = DAG.getTargetExternalSymbol(
S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
- } else if (Subtarget->isTarget64BitILP32() &&
+ } else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
@@ -3552,7 +3396,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget->getTargetTriple().isOSMSVCRT() &&
+ !Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
@@ -3562,6 +3406,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
+ if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
+    // No need to reset the stack after the call if the call doesn't return. To
+    // keep the MI verifier happy, we'll pretend the callee does it for us.
+ NumBytesForCalleeToPop = NumBytes;
+ }
+
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
@@ -3614,8 +3464,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
@@ -3636,8 +3486,28 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
- const X86InstrInfo *TII) {
+ const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+
+ for (;;) {
+ // Look through nodes that don't alter the bits of the incoming value.
+ unsigned Op = Arg.getOpcode();
+ if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+ Arg = Arg.getOperand(0);
+ continue;
+ }
+ if (Op == ISD::TRUNCATE) {
+ const SDValue &TruncInput = Arg.getOperand(0);
+ if (TruncInput.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+ Arg.getValueType()) {
+ Arg = TruncInput.getOperand(0);
+ continue;
+ }
+ }
+ break;
+ }
+
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
@@ -3647,7 +3517,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Def)
return false;
if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(Def, FI))
+ if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
@@ -3682,7 +3552,20 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
assert(FI != INT_MAX);
if (!MFI->isFixedObjectIndex(FI))
return false;
- return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+
+ if (Offset != MFI->getObjectOffset(FI))
+ return false;
+
+ if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
+ // If the argument location is wider than the argument type, check that any
+ // extension flags match.
+ if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
+ Flags.isSExt() != MFI->isObjectSExt(FI)) {
+ return false;
+ }
+ }
+
+ return Bytes == MFI->getObjectSize(FI);
}
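// A hedged, standalone restatement of the final checks above (names are
// illustrative; plain values stand in for MachineFrameInfo state): a fixed
// stack slot only matches when the byte offset agrees, the zext/sext flags
// agree whenever the location is wider than the argument, and the sizes agree.
struct FixedSlotSketch {
  long long Offset;
  unsigned long long SizeInBytes;
  bool IsZExt, IsSExt;
};
static bool slotMatchesSketch(const FixedSlotSketch &Slot, long long Offset,
                              unsigned long long Bytes, bool LocIsWider,
                              bool ArgZExt, bool ArgSExt) {
  if (Offset != Slot.Offset)
    return false;
  if (LocIsWider && (ArgZExt != Slot.IsZExt || ArgSExt != Slot.IsSExt))
    return false; // Extension kind must match for widened locations.
  return Bytes == Slot.SizeInBytes;
}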
/// Check whether the call is eligible for tail call optimization. Targets
@@ -3708,8 +3591,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
- bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
- bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -3728,7 +3611,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3739,6 +3622,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
+ LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
@@ -3746,8 +3630,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
return false;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3767,8 +3650,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -3777,34 +3659,17 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
}
- // If the calling conventions do not match, then we'd better make sure the
- // results are returned in the same way as what the caller expects.
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+ RetCC_X86, RetCC_X86))
+ return false;
+ // The callee has to preserve all registers the caller needs to preserve.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
- SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
- *DAG.getContext());
- CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
-
- SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
- *DAG.getContext());
- CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
-
- if (RVLocs1.size() != RVLocs2.size())
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
- for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
- if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
- return false;
- if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
- return false;
- if (RVLocs1[i].isRegLoc()) {
- if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
- return false;
- } else {
- if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
- return false;
- }
- }
}
unsigned StackArgsSize = 0;
@@ -3815,8 +3680,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
@@ -3830,7 +3694,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3839,26 +3703,25 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
- MFI, MRI, TII))
+ MFI, MRI, TII, VA))
return false;
}
}
}
+ bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
- if (!Subtarget->is64Bit() &&
- ((!isa<GlobalAddressSDNode>(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee)) ||
- DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+ if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
- unsigned MaxInRegs =
- (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3874,10 +3737,14 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
}
}
}
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+ return false;
}
bool CalleeWillPop =
- X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+ X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
@@ -3923,6 +3790,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
@@ -3935,16 +3804,30 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
+ case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
+ case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VZEXT_MOVL:
return true;
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+static bool isTargetShuffleVariableMask(unsigned Opcode) {
+ switch (Opcode) {
+ default: return false;
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMILPV:
+ return true;
+ }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3959,7 +3842,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3978,7 +3861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -4047,17 +3930,20 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
- default: llvm_unreachable("Invalid integer condition!");
- case X86::COND_E: return true;
- case X86::COND_G: return false;
- case X86::COND_GE: return false;
- case X86::COND_L: return false;
- case X86::COND_LE: return false;
- case X86::COND_NE: return true;
- case X86::COND_B: return true;
- case X86::COND_A: return true;
- case X86::COND_BE: return true;
- case X86::COND_AE: return true;
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E:
+ case X86::COND_NE:
+ case X86::COND_B:
+ case X86::COND_A:
+ case X86::COND_BE:
+ case X86::COND_AE:
+ return true;
+ case X86::COND_G:
+ case X86::COND_GE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ return false;
}
}
@@ -4080,8 +3966,9 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
-static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
- SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+ bool isFP, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
@@ -4181,24 +4068,50 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!IntrData)
return false;
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.readMem = false;
+ Info.writeMem = false;
+ Info.vol = false;
+ Info.offset = 0;
+
switch (IntrData->Type) {
- case LOADA:
- case LOADU: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(I.getType());
+ case EXPAND_FROM_MEM: {
Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
- Info.vol = false;
+ Info.memVT = MVT::getVT(I.getType());
+ Info.align = 1;
Info.readMem = true;
- Info.writeMem = false;
- return true;
+ break;
}
- default:
+ case COMPRESS_TO_MEM: {
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
+ Info.align = 1;
+ Info.writeMem = true;
break;
}
+ case TRUNCATE_TO_MEM_VI8:
+ case TRUNCATE_TO_MEM_VI16:
+ case TRUNCATE_TO_MEM_VI32: {
+ Info.ptrVal = I.getArgOperand(0);
+ MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
+ MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
+ ScalarVT = MVT::i8;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
+ ScalarVT = MVT::i16;
+ else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
+ ScalarVT = MVT::i32;
+
+ Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
+ Info.align = 1;
+ Info.writeMem = true;
+ break;
+ }
+ default:
+ return false;
+ }
- return false;
+ return true;
}
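// Worked example for the TRUNCATE_TO_MEM_* cases above (the helper is an
// illustrative sketch, not part of this file): a truncating store of a v8i64
// value to i8 lanes records memVT = v8i8, i.e. the element count is kept while
// the scalar width shrinks, so only 8 bytes are written.
static unsigned truncStoreBytesSketch(unsigned NumElts,
                                      unsigned DstScalarBits) {
  return NumElts * (DstScalarBits / 8); // Store size of NumElts x iN in bytes.
}
// truncStoreBytesSketch(8, 8) == 8 for the v8i64 -> v8i8 case.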
/// Returns true if the target can instruction select the
@@ -4246,12 +4159,24 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
- return Subtarget->hasBMI();
+ return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
- return Subtarget->hasLZCNT();
+ return Subtarget.hasLZCNT();
+}
+
+bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+ if (!Subtarget.hasBMI())
+ return false;
+
+ // There are only 32-bit and 64-bit forms for 'andn'.
+ EVT VT = Y.getValueType();
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ return true;
}
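// For context, a scalar sketch of the pattern this hook enables (assuming
// BMI): 'andn' computes (~X & Y) in one instruction, so a compare such as
// ((~X & Y) != 0) needs no separate NOT. Only 32- and 64-bit forms exist,
// hence the type check above.
#include <cstdint>
static uint64_t andnSketch(uint64_t X, uint64_t Y) { return ~X & Y; }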
/// Return true if every element in Mask, beginning
@@ -4269,11 +4194,26 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (!isUndefOrInRange(M, Low, Hi))
+ return false;
+ return true;
+}
+
/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return (Val < 0 || Val == CmpVal);
}
+/// Val is either the undef or zero sentinel value.
+static bool isUndefOrZero(int Val) {
+ return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
+}
+
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
@@ -4285,6 +4225,17 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size), or is undef or zero.
+static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
+ if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
+ return false;
+ return true;
+}
+
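// A standalone mirror of the predicate above over plain ints, with a worked
// case (sentinels follow this file: SM_SentinelUndef = -1,
// SM_SentinelZero = -2; the names below are illustrative):
#include <vector>
static bool isUndefOrZeroSketch(int Val) { return Val == -1 || Val == -2; }
static bool seqOrUndefOrZeroSketch(const std::vector<int> &Mask, unsigned Pos,
                                   unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZeroSketch(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}
// seqOrUndefOrZeroSketch({4, -1, 6, -2}, 0, 4, 4) == true: lanes 0 and 2 match
// the sequence 4,5,6,7 at their positions; lanes 1 and 3 are undef/zero
// sentinels and are accepted.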
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
@@ -4399,9 +4350,8 @@ bool X86::isZeroNode(SDValue Elt) {
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
-static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
- SelectionDAG &DAG,
- SDLoc dl, bool IsMask = false) {
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
+ const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4424,63 +4374,40 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
- SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
+ SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
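// A hedged sketch of the 64-bit split mentioned above: on a 32-bit target each
// 64-bit lane is presumably emitted as two 32-bit lanes, low half first, since
// x86 is little endian. Plain integers stand in for SDValues; the helper name
// is illustrative.
#include <cstdint>
#include <vector>
static std::vector<uint32_t>
splitConstsSketch(const std::vector<uint64_t> &Vals) {
  std::vector<uint32_t> Out;
  for (uint64_t V : Vals) {
    Out.push_back(static_cast<uint32_t>(V));       // Low 32 bits first.
    Out.push_back(static_cast<uint32_t>(V >> 32)); // Then the high 32 bits.
  }
  return Out;
}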
/// Returns a vector of specified type with all zero elements.
-static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
- assert(VT.isVector() && "Expected a vector type");
-
- // Always build SSE zero vectors as <4 x i32> bitcasted
- // to their dest type. This ensures they get CSE'd.
+static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
+ VT.getVectorElementType() == MVT::i1) &&
+ "Unexpected vector type");
+
+ // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd. But if the integer type is not
+ // available, use a floating-point +0.0 instead.
SDValue Vec;
- if (VT.is128BitVector()) { // SSE
- if (Subtarget->hasSSE2()) { // SSE2
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- } else { // SSE1
- SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
- }
- } else if (VT.is256BitVector()) { // AVX
- if (Subtarget->hasInt256()) { // AVX2
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
- } else {
- // 256-bit logic and arithmetic instructions in AVX are all
- // floating-point, no support for integer ops. Emit fp zeroed vectors.
- SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
- }
- } else if (VT.is512BitVector()) { // AVX-512
- SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
+ Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.getVectorElementType() == MVT::i1) {
-
- assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
- && "Unexpected vector type");
- assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
- && "Unexpected vector type");
- SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
- SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- } else
- llvm_unreachable("Unexpected vector type");
-
+ assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
+ "Unexpected vector type");
+ assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
+ "Unexpected vector type");
+ Vec = DAG.getConstant(0, dl, VT);
+ } else {
+ unsigned Num32BitElts = VT.getSizeInBits() / 32;
+ Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
+ }
return DAG.getBitcast(VT, Vec);
}
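// The selection logic above, reduced to plain data (illustrative sketch, not
// part of this file): returns the number of i32 lanes used for the canonical
// zero, or 0 when the type is materialized directly (i1 vectors, or 128-bit
// vectors without SSE2, which use a v4f32 +0.0 instead).
static unsigned zeroVectorI32LanesSketch(unsigned SizeInBits, bool EltIsI1,
                                         bool HasSSE2) {
  if (EltIsI1)
    return 0;             // Build the i1 vector zero directly.
  if (!HasSSE2 && SizeInBits == 128)
    return 0;             // SSE1 only: use v4f32 +0.0.
  return SizeInBits / 32; // E.g. 256 bits -> v8i32 zero, then bitcast.
}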
-static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl,
- unsigned vectorWidth) {
+static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
EVT VT = Vec.getValueType();
@@ -4490,7 +4417,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return DAG.getUNDEF(ResultVT);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
@@ -4503,8 +4430,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -4516,27 +4443,27 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+ return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+ return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
-static SDValue InsertSubVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl, unsigned vectorWidth) {
+static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl,
+ unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
  // Inserting UNDEF leaves Result unchanged.
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
@@ -4560,8 +4487,8 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -4570,7 +4497,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
// extend the subvector to the size of the result vector. Make sure that
// we are not recursing on that node by checking for undef here.
if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- Result.getOpcode() != ISD::UNDEF) {
+ !Result.isUndef()) {
EVT ResultVT = Result.getValueType();
SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
SDValue Undef = DAG.getUNDEF(ResultVT);
@@ -4607,17 +4534,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return DAG.getBitcast(ResultVT, Vec256);
}
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+ return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Insert i1-subvector to i1-vector.
-static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
+static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
@@ -4647,43 +4575,71 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
+  // Extend to a natively supported kshift.
+ MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ MVT WideOpVT = OpVT;
+ if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
+ WideOpVT = MinVT;
+
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(OpVT);
- SDValue WideSubVec =
- DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
- if (Vec.isUndef())
- return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ SDValue Undef = DAG.getUNDEF(WideOpVT);
+ SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, SubVec, ZeroIdx);
+
+  // Extract the subvector if required.
+ auto ExtractSubVec = [&](SDValue V) {
+ return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+ OpVT, V, ZeroIdx);
+ };
+
+ if (Vec.isUndef()) {
+ if (IdxVal != 0) {
+ SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ }
+ return ExtractSubVec(WideSubVec);
+ }
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
- DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
+ return ExtractSubVec(Vec);
}
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- // Merge them together
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+    // Merge them together; SubVec should be zero-extended.
+ WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ SubVec, ZeroIdx);
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+ return ExtractSubVec(Vec);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+ return ExtractSubVec(Vec);
}
  // Subvector should be inserted in the middle - use a shuffle.
+ WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
SmallVector<int, 64> Mask;
for (unsigned i = 0; i < NumElems; ++i)
Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
@@ -4695,103 +4651,206 @@ static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
-static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+ const SDLoc &dl) {
+ SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}
-static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+ const SDLoc &dl) {
+ SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
- assert(VT.isVector() && "Expected a vector type");
+static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDLoc &dl) {
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected a 128/256/512-bit vector type");
- SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
+ APInt Ones = APInt::getAllOnesValue(32);
+ unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec;
- if (VT.is512BitVector()) {
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
- } else if (VT.is256BitVector()) {
- if (Subtarget->hasInt256()) { // AVX2
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
- } else { // AVX
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- }
- } else if (VT.is128BitVector()) {
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- } else
- llvm_unreachable("Unexpected vector type");
-
+ if (!Subtarget.hasInt256() && NumElts == 8) {
+ Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
+ Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+ } else {
+ Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+ }
return DAG.getBitcast(VT, Vec);
}
/// Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
- SDValue V2) {
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ assert(VT.is128BitVector() && "Expected a 128-bit vector type");
unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
+ SmallVector<int, 8> Mask(NumElems);
for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
- Mask.push_back(i);
- Mask.push_back(i + NumElems);
+ Mask[i * 2] = i;
+ Mask[i * 2 + 1] = i + NumElems;
}
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
- SDValue V2) {
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+ SDValue V1, SDValue V2) {
+ assert(VT.is128BitVector() && "Expected a 128-bit vector type");
unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
+ SmallVector<int, 8> Mask(NumElems);
for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
- Mask.push_back(i + Half);
- Mask.push_back(i + NumElems + Half);
+ Mask[i * 2] = i + Half;
+ Mask[i * 2 + 1] = i + NumElems + Half;
}
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
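// Worked mask shapes for the two helpers above, for NumElems == 4 (first input
// lanes are 0..3, second input lanes are 4..7):
//   unpackl -> {0, 4, 1, 5}  interleaves the low halves
//   unpackh -> {2, 6, 3, 7}  interleaves the high halves
// A standalone sketch over plain ints (the helper name is ours):
#include <vector>
static std::vector<int> unpackMaskSketch(unsigned NumElems, bool Lo) {
  std::vector<int> Mask(NumElems);
  unsigned Half = NumElems / 2, Base = Lo ? 0 : Half;
  for (unsigned i = 0; i != Half; ++i) {
    Mask[i * 2] = Base + i;                // Lane from the first input.
    Mask[i * 2 + 1] = Base + i + NumElems; // Matching lane from the second.
  }
  return Mask;
}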
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
-static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 16> MaskVec;
- for (unsigned i = 0; i != NumElems; ++i)
+ int NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec(NumElems);
+ for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
- MaskVec.push_back(i == Idx ? NumElems : i);
- return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
+ MaskVec[i] = (i == Idx) ? NumElems : i;
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
+}
+
+static SDValue peekThroughBitcasts(SDValue V) {
+ while (V.getNode() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ return V;
+}
+
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+ unsigned MaskEltSizeInBits,
+ SmallVectorImpl<uint64_t> &RawMask) {
+ MaskNode = peekThroughBitcasts(MaskNode);
+
+ MVT VT = MaskNode.getSimpleValueType();
+ assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+
+ // Split an APInt element into MaskEltSizeInBits sized pieces and
+ // insert into the shuffle mask.
+ auto SplitElementToMask = [&](APInt Element) {
+ // Note that this is x86 and so always little endian: the low byte is
+ // the first byte of the mask.
+ int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+ for (int i = 0; i < Split; ++i) {
+ APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
+ Element = Element.lshr(MaskEltSizeInBits);
+ RawMask.push_back(RawElt.getZExtValue());
+ }
+ };
+
+ if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
+ if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
+ return false;
+    if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
+      const APInt &MaskElement = CN->getAPIntValue();
+      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+        APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
+        RawMask.push_back(RawElt.getZExtValue());
+      }
+      return true;
+    }
+    return false;
+ }
+
+ if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
+ MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
+
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ return false;
+ unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+
+ SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
+ SplitElementToMask(CN->getAPIntValue());
+ RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+ return true;
+ }
+ return false;
+ }
+
+ if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // We can always decode if the buildvector is all zero constants,
+ // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
+ if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
+ RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
+ return true;
+ }
+
+ // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+ if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ return false;
+
+ for (SDValue Op : MaskNode->ops()) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
+ SplitElementToMask(CN->getAPIntValue());
+ else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
+ SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
+ else
+ return false;
+ }
+
+ return true;
+}
+
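// A standalone sketch of the little-endian split performed by the
// SplitElementToMask lambda above (helper name and bounds are ours;
// MaskEltBits is assumed to be < 64): a 32-bit constant 0x03020100 split into
// 8-bit mask elements yields {0x00, 0x01, 0x02, 0x03}.
#include <cstdint>
#include <vector>
static std::vector<uint64_t> splitElementSketch(uint64_t Element,
                                                unsigned EltBits,
                                                unsigned MaskEltBits) {
  std::vector<uint64_t> RawMask;
  uint64_t LowMask = (1ULL << MaskEltBits) - 1;
  for (unsigned i = 0, e = EltBits / MaskEltBits; i != e; ++i) {
    RawMask.push_back(Element & LowMask); // Low bits form the first element.
    Element >>= MaskEltBits;
  }
  return RawMask;
}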
+static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
+ MaskNode = peekThroughBitcasts(MaskNode);
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return nullptr;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return nullptr;
+
+ return dyn_cast<Constant>(MaskCP->getConstVal());
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
-/// Returns true if the Mask could be calculated. Sets IsUnary to true if only
-/// uses one source. Note that this will set IsUnary for shuffles which use a
-/// single input multiple times, and in those cases it will
-/// adjust the mask to only have indices within that single input.
+/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
+/// operands in \p Ops, and returns true.
+/// Sets \p IsUnary to true if only one source is used. Note that this will set
+/// IsUnary for shuffles which use a single input multiple times, and in those
+/// cases it will adjust the mask to only have indices within that single input.
+/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+ SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
+ assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
+ assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
+
IsUnary = false;
bool IsFakeUnary = false;
switch(N->getOpcode()) {
@@ -4826,9 +4885,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
+ case X86ISD::VSHLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VSRLDQ:
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ ImmN = N->getOperand(N->getNumOperands() - 1);
+ DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -4845,70 +4917,51 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
+ case X86ISD::VZEXT_MOVL:
+ DecodeZeroMoveLowMask(VT, Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VBROADCAST: {
+ // We only decode broadcasts of same-sized vectors at the moment.
+ if (N->getOperand(0).getValueType() == VT) {
+ DecodeVectorBroadcast(VT, Mask);
+ IsUnary = true;
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMILPV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMILPMask(VT, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPERMILPMask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
case X86ISD::PSHUFB: {
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(0);
-
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() &&
- "Can't produce a non-vector with a build_vector!");
- if (!VT.isInteger())
- return false;
-
- int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
-
- SmallVector<uint64_t, 32> RawMask;
- for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF) {
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- continue;
- }
- auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
- if (!CN)
- return false;
- APInt MaskElement = CN->getAPIntValue();
-
- // We now have to decode the element which could be any integer size and
- // extract each byte of it.
- for (int j = 0; j < NumBytesPerElement; ++j) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
- MaskElement = MaskElement.lshr(8);
- }
- }
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodePSHUFBMask(RawMask, Mask);
break;
}
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodePSHUFBMask(C, Mask);
break;
}
-
return false;
}
case X86ISD::VPERMI:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -4937,110 +4990,63 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::MOVLPS:
// Not yet implemented
return false;
+ case X86ISD::VPERMIL2: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ SDValue MaskNode = N->getOperand(2);
+ SDValue CtrlNode = N->getOperand(3);
+ if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
+ unsigned CtrlImm = CtrlOp->getZExtValue();
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+ DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+ break;
+ }
+ }
+ return false;
+ }
+ case X86ISD::VPPERM: {
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ SDValue MaskNode = N->getOperand(2);
+ SmallVector<uint64_t, 32> RawMask;
+ if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+ DecodeVPPERMMask(RawMask, Mask);
+ break;
+ }
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ DecodeVPPERMMask(C, Mask);
+ break;
+ }
+ return false;
+ }
case X86ISD::VPERMV: {
IsUnary = true;
+ // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
+ Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(0);
-
- unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
SmallVector<uint64_t, 32> RawMask;
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- assert(MaskNode.getSimpleValueType().isInteger() &&
- MaskNode.getSimpleValueType().getVectorNumElements() ==
- VT.getVectorNumElements());
-
- for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF)
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- else if (isa<ConstantSDNode>(Op)) {
- APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
- RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
- } else
- return false;
- }
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMVMask(RawMask, Mask);
break;
}
- if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
- unsigned NumEltsInMask = MaskNode->getNumOperands();
- MaskNode = MaskNode->getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
- APInt MaskEltValue = CN->getAPIntValue();
- for (unsigned i = 0; i < NumEltsInMask; ++i)
- RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
- DecodeVPERMVMask(RawMask, Mask);
- break;
- }
- // It may be a scalar load
- }
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodeVPERMVMask(C, VT, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
- IsUnary = false;
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
+ Ops.push_back(N->getOperand(0));
+ Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
- while (MaskNode->getOpcode() == ISD::BITCAST)
- MaskNode = MaskNode->getOperand(1);
-
- if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
- // If we have a build-vector, then things are easy.
- assert(MaskNode.getSimpleValueType().isInteger() &&
- MaskNode.getSimpleValueType().getVectorNumElements() ==
- VT.getVectorNumElements());
-
- SmallVector<uint64_t, 32> RawMask;
- unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
-
- for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
- SDValue Op = MaskNode->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF)
- RawMask.push_back((uint64_t)SM_SentinelUndef);
- else {
- auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
- if (!CN)
- return false;
- APInt MaskElement = CN->getAPIntValue();
- RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
- }
- }
- DecodeVPERMV3Mask(RawMask, Mask);
- break;
- }
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return false;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return false;
-
- if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
DecodeVPERMV3Mask(C, VT, Mask);
break;
}
@@ -5055,8 +5061,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
- if (std::any_of(Mask.begin(), Mask.end(),
- [](int M){ return M == SM_SentinelZero; }))
+ if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
@@ -5067,6 +5072,123 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
if (M >= (int)Mask.size())
M -= Mask.size();
+ // If we didn't already add operands in the opcode-specific code, default to
+ // adding 1 or 2 operands starting at 0.
+ if (Ops.empty()) {
+ Ops.push_back(N->getOperand(0));
+ if (!IsUnary || IsFakeUnary)
+ Ops.push_back(N->getOperand(1));
+ }
+
+ return true;
+}
+
+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that are known to be zero
+/// (not just zeroable) from their inputs.
+/// Returns true if the target shuffle mask was decoded.
+static bool setTargetShuffleZeroElements(SDValue N,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops) {
+ bool IsUnary;
+ if (!isTargetShuffle(N.getOpcode()))
+ return false;
+ if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
+ Mask, IsUnary))
+ return false;
+
+ SDValue V1 = Ops[0];
+ SDValue V2 = IsUnary ? V1 : Ops[1];
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+
+ // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+ if (M < 0)
+ continue;
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // We are referencing an UNDEF input.
+ if (V.isUndef()) {
+ Mask[i] = SM_SentinelUndef;
+ continue;
+ }
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+    // If the BUILD_VECTOR has fewer elements than the shuffle mask, then the
+    // (larger) source element must be UNDEF/ZERO.
+ // TODO: Is it worth testing the individual bits of a constant?
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ Mask[i] = SM_SentinelUndef;
+ else if (X86::isZeroNode(Op))
+ Mask[i] = SM_SentinelZero;
+ continue;
+ }
+
+    // If the BUILD_VECTOR has more elements than the shuffle mask, then the
+    // (smaller) source elements spanned by a single mask element must all be
+    // UNDEF or all be ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ Mask[i] = SM_SentinelUndef;
+ else if (AllZero)
+ Mask[i] = SM_SentinelZero;
+ continue;
+ }
+ }
+
+ return true;
+}
+
+/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
+/// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
+/// remaining input indices in case we now have a unary shuffle, adjusting the
+/// Op0/Op1 inputs accordingly.
+/// Returns true if the target shuffle mask was decoded.
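+///
+/// For example (indices illustrative): if only the second input is used, a
+/// 4-element mask <4, 5, 6, 7> is commuted to <0, 1, 2, 3>, Op0 takes Ops[1]
+/// and Op1 is left empty.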
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+ SmallVectorImpl<int> &Mask) {
+ SmallVector<SDValue, 2> Ops;
+ if (!setTargetShuffleZeroElements(Op, Mask, Ops))
+ return false;
+
+ int NumElts = Mask.size();
+ bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
+ return 0 <= Idx && Idx < NumElts;
+ });
+ bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
+ [NumElts](int Idx) { return NumElts <= Idx; });
+
+ Op0 = Op0InUse ? Ops[0] : SDValue();
+ Op1 = Op1InUse ? Ops[1] : SDValue();
+
+ // We're only using Op1 - commute the mask and inputs.
+ if (!Op0InUse && Op1InUse) {
+ for (int &M : Mask)
+ if (NumElts <= M)
+ M -= NumElts;
+ Op0 = Op1;
+ Op1 = SDValue();
+ }
+
return true;
}
@@ -5097,19 +5219,24 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = V.getSimpleValueType();
+ MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
- if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
+ if (Elt == SM_SentinelZero)
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
if (Elt == SM_SentinelUndef)
- return DAG.getUNDEF(ShufVT.getVectorElementType());
+ return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
- SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
@@ -5138,7 +5265,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget* Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 8)
return SDValue();
@@ -5148,7 +5275,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
bool First = true;
// SSE4.1 - use PINSRB to insert each byte directly.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
if (isNonZero) {
@@ -5208,7 +5335,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget* Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 4)
return SDValue();
@@ -5237,13 +5364,13 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
- Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+ Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
@@ -5296,12 +5423,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
// Let the shuffle legalizer deal with blend operations.
SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
- V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
- return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
@@ -5326,9 +5453,9 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
- V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
- V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
unsigned ZMask = Zeroable.to_ulong();
@@ -5342,11 +5469,11 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
}
/// Return a vector logical shift node.
-static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
- unsigned NumBits, SelectionDAG &DAG,
- const TargetLowering &TLI, SDLoc dl) {
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
+ SelectionDAG &DAG, const TargetLowering &TLI,
+ const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- MVT ShVT = MVT::v2i64;
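+  // VSHLDQ/VSRLDQ (PSLLDQ/PSRLDQ) shift the whole vector by bytes, so model
+  // the shift on a byte vector type.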
+ MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
@@ -5355,8 +5482,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
-static SDValue
-LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
+static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
+ SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load. And if
// the address is "base + cst" see if the cst can be "absorbed" into
@@ -5418,12 +5545,11 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(StartOffset),
- false, false, false, 0);
+ LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
- return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
@@ -5433,55 +5559,103 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
-/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
-///
-/// FIXME: we'd also like to handle the case where the last elements are zero
-/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
-/// There's even a handy isZeroNode for that purpose.
+/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
- LoadSDNode *LDBase = nullptr;
- unsigned LastLoadedElt = -1U;
+ int LastLoadedElt = -1;
+ SmallBitVector LoadMask(NumElems, false);
+ SmallBitVector ZeroMask(NumElems, false);
+ SmallBitVector UndefMask(NumElems, false);
- // For each element in the initializer, see if we've found a load or an undef.
- // If we don't find an initial load element, or later load elements are
- // non-consecutive, bail out.
+  // For each element in the initializer, see if we've found a load, a zero or
+  // an undef.
for (unsigned i = 0; i < NumElems; ++i) {
- SDValue Elt = Elts[i];
- // Look through a bitcast.
- if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
- Elt = Elt.getOperand(0);
- if (!Elt.getNode() ||
- (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (!Elt.getNode())
return SDValue();
- if (!LDBase) {
- if (Elt.getNode()->getOpcode() == ISD::UNDEF)
- return SDValue();
- LDBase = cast<LoadSDNode>(Elt.getNode());
- LastLoadedElt = i;
- continue;
- }
- if (Elt.getOpcode() == ISD::UNDEF)
- continue;
- LoadSDNode *LD = cast<LoadSDNode>(Elt);
- EVT LdVT = Elt.getValueType();
- // Each loaded element must be the correct fractional portion of the
- // requested vector load.
- if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
- return SDValue();
- if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
+ if (Elt.isUndef())
+ UndefMask[i] = true;
+ else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
+ ZeroMask[i] = true;
+ else if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ LoadMask[i] = true;
+ LastLoadedElt = i;
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
+ } else
return SDValue();
- LastLoadedElt = i;
}
+ assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ "Incomplete element masks");
+
+ // Handle Special Cases - all undef or undef/zero.
+ if (UndefMask.count() == NumElems)
+ return DAG.getUNDEF(VT);
+
+ // FIXME: Should we return this as a BUILD_VECTOR instead?
+ if ((ZeroMask | UndefMask).count() == NumElems)
+ return VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ int FirstLoadedElt = LoadMask.find_first();
+ SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
+ LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
+ EVT LDBaseVT = EltBase.getValueType();
+
+  // Consecutive loads can contain UNDEFs but not ZERO elements.
+  // Consecutive loads with UNDEF and ZERO elements require an additional
+  // shuffle stage to clear the ZERO elements.
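+  // For example (values illustrative), <load a[0], load a[1], zero, load a[3]>
+  // is a consecutive load with zeros; element 2 is cleared by a later blend.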
+ bool IsConsecutiveLoad = true;
+ bool IsConsecutiveLoadWithZeros = true;
+ for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
+ if (LoadMask[i]) {
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ if (!DAG.areNonVolatileConsecutiveLoads(
+ LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
+ i - FirstLoadedElt)) {
+ IsConsecutiveLoad = false;
+ IsConsecutiveLoadWithZeros = false;
+ break;
+ }
+ } else if (ZeroMask[i]) {
+ IsConsecutiveLoad = false;
+ }
+ }
+
+ auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+ auto MMOFlags = LDBase->getMemOperand()->getFlags();
+ assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
+ "Cannot merge volatile loads.");
+ SDValue NewLd =
+ DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+
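+    // Keep the new load correctly ordered: replace uses of LDBase's output
+    // chain with a TokenFactor of the old and new load chains.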
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
+ };
+ // LOAD - all consecutive load/undefs (must start/end with a load).
// If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer. If we found
- // consecutive loads for the low half, generate a vzext_load node.
- if (LastLoadedElt == NumElems - 1) {
+ // load of the entire vector width starting at the base pointer.
+ // If the vector contains zeros, then attempt to shuffle those elements.
+ if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
assert(LDBase && "Did not find base load for merging consecutive loads");
EVT EltVT = LDBase->getValueType(0);
// Ensure that the input vector size for the merged loads matches the
@@ -5489,72 +5663,93 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
return SDValue();
- if (isAfterLegalize &&
- !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+ if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
- SDValue NewLd = SDValue();
+ if (IsConsecutiveLoad)
+ return CreateLoad(VT, LDBase);
+
+ // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
+ // vector and a zero vector to clear out the zero elements.
+ if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ SmallVector<int, 4> ClearMask(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (ZeroMask[i])
+ ClearMask[i] = i + NumElems;
+ else if (LoadMask[i])
+ ClearMask[i] = i;
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
+ }
+ }
+
+ int LoadSize =
+ (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+
+ // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
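+  // For example (values illustrative), v4i32 <load a[0], load a[1], zero,
+  // zero> becomes a single 64-bit VZEXT_LOAD that zero-fills the upper half.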
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+ if (TLI.isTypeLegal(VecVT)) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
- NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(), LDBase->isVolatile(),
- LDBase->isNonTemporal(), LDBase->isInvariant(),
- LDBase->getAlignment());
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+ // and update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
+ return DAG.getBitcast(VT, ResNode);
}
-
- return NewLd;
}
- //TODO: The code below fires only for for loading the low v2i32 / v2f32
- //of a v4i32 / v4f32. It's probably worth generalizing.
- EVT EltVT = VT.getVectorElementType();
- if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
- DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
- SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
- SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
- LDBase->getPointerInfo(),
- LDBase->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as LDBase in
- // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
- // update uses of LDBase's output chain to use the TokenFactor.
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
+ // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
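+  // For example (values illustrative), v4f32 <load a, zero, zero, zero> reuses
+  // the scalar load via SCALAR_TO_VECTOR and zeros the upper elements with
+  // VZEXT_MOVL.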
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
+ ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
+ if (TLI.isTypeLegal(VecVT)) {
+ SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
+ : DAG.getBitcast(VecSVT, EltBase);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+ return DAG.getBitcast(VT, V);
}
-
- return DAG.getBitcast(VT, ResNode);
}
+
return SDValue();
}
-/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
-/// to generate a splat value for the following cases:
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
MVT VT = Op.getSimpleValueType();
@@ -5604,12 +5799,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
Sc.getOpcode() != ISD::BUILD_VECTOR) {
- if (!Subtarget->hasInt256())
+ if (!Subtarget.hasInt256())
return SDValue();
// Use the register form of the broadcast instruction available on AVX2.
if (VT.getSizeInBits() >= 256)
- Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ Sc = extract128BitVector(Sc, 0, DAG, dl);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
}
@@ -5622,7 +5817,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// Constants may have multiple users.
// AVX-512 has register version of the broadcast
- bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+ bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
Ld.getValueType().getSizeInBits() >= 32;
if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
!hasRegVer))
@@ -5647,7 +5842,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
- if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
+ if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
@@ -5656,7 +5851,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
+ (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -5671,8 +5866,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
- false, false, Alignment);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5681,7 +5876,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
- if (!IsLoad && Subtarget->hasInt256() &&
+ if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
@@ -5690,12 +5885,12 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (Subtarget->hasVLX() && ScalarSize == 64))
+ (Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
- if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5801,7 +5996,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
- SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
unsigned Idx = InsertIndices[i];
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
@@ -5818,7 +6013,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
uint64_t Immediate = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
- if (In.getOpcode() != ISD::UNDEF)
+ if (!In.isUndef())
Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
}
SDLoc dl(Op);
@@ -5835,17 +6030,11 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
- if (ISD::isBuildVectorAllZeros(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- }
+ if (ISD::isBuildVectorAllZeros(Op.getNode()))
+ return DAG.getTargetConstant(0, dl, VT);
- if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
- }
+ if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ return DAG.getTargetConstant(1, dl, VT);
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
@@ -5864,7 +6053,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
- if (In.getOpcode() == ISD::UNDEF)
+ if (In.isUndef())
continue;
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
@@ -5872,7 +6061,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
HasConstElts = true;
}
- if (SplatIdx == -1)
+ if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
@@ -5903,7 +6092,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0, dl));
}
- for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
+ for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
@@ -5948,7 +6137,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
- if (Op->getOpcode() == ISD::UNDEF) {
+ if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
@@ -5978,13 +6167,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
- if (V0.getOpcode() == ISD::UNDEF) {
+ if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
- if (V1.getOpcode() == ISD::UNDEF) {
+ if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
@@ -6041,37 +6230,35 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
- EVT VT = V0.getValueType();
- assert(VT.is256BitVector() && VT == V1.getValueType() &&
+ MVT VT = V0.getSimpleValueType();
+ assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
- SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
- SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
- SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
- SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
- EVT NewVT = V0_LO.getValueType();
+ SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
+ MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
- if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+ if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
- if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+ if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
- if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
- V1_LO->getOpcode() != ISD::UNDEF))
+ if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
- if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
- V1_HI->getOpcode() != ISD::UNDEF))
+ if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
@@ -6081,10 +6268,10 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT VT = BV->getSimpleValueType(0);
- if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
return SDValue();
SDLoc DL(BV);
@@ -6142,12 +6329,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
SubFound = true;
// Update InVec0 and InVec1.
- if (InVec0.getOpcode() == ISD::UNDEF) {
+ if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return SDValue();
}
- if (InVec1.getOpcode() == ISD::UNDEF) {
+ if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return SDValue();
@@ -6174,8 +6361,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
- InVec1.getOpcode() != ISD::UNDEF)
+ if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
return SDValue();
@@ -6183,7 +6369,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = BV->getSimpleValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -6193,11 +6379,11 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
// Count the number of UNDEF operands in the build_vector in input.
for (unsigned i = 0, e = Half; i != e; ++i)
- if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
- if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
// Early exit if this is either a build_vector of all UNDEFs or all the
@@ -6207,14 +6393,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
SDLoc DL(BV);
SDValue InVec0, InVec1;
- if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
- } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
// Try to match an SSSE3 integer HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
@@ -6223,7 +6409,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
}
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
@@ -6232,18 +6418,14 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
SDValue InVec2, InVec3;
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
// Try to match an AVX2 horizontal add/sub of signed integers.
@@ -6253,17 +6435,13 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
- ((InVec0.getOpcode() == ISD::UNDEF ||
- InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
- ((InVec1.getOpcode() == ISD::UNDEF ||
- InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+ ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
@@ -6271,7 +6449,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (CanFold) {
// Fold this build_vector into a single horizontal add/sub.
// Do this only if the target has AVX2.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
// Do not try to expand this build_vector into a pair of horizontal
@@ -6289,7 +6467,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
}
if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
- VT == MVT::v16i16) && Subtarget->hasAVX()) {
+ VT == MVT::v16i16) && Subtarget.hasAVX()) {
unsigned X86Opcode;
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
@@ -6318,39 +6496,101 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return SDValue();
}
-SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
+/// If a BUILD_VECTOR's source elements all apply the same bit operation and
+/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
+/// just apply the bit to the vectors.
+/// NOTE: It's not in our interest to start making a general purpose
+/// vectorizer from this, but enough scalar bit operations are created by the
+/// later legalization + scalarization stages to warrant basic support.
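+///
+/// Sketch (operands illustrative): (build_vector (and a, 1), (and b, 2))
+/// lowers to (and (build_vector a, b), (build_vector 1, 2)).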
+static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- MVT ExtVT = VT.getVectorElementType();
- unsigned NumElems = Op.getNumOperands();
+ unsigned NumElems = VT.getVectorNumElements();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // Generate vectors for predicate vectors.
- if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
- return LowerBUILD_VECTORvXi1(Op, DAG);
+ // Check that all elements have the same opcode.
+ // TODO: Should we allow UNDEFS and if so how many?
+ unsigned Opcode = Op.getOperand(0).getOpcode();
+ for (unsigned i = 1; i < NumElems; ++i)
+ if (Opcode != Op.getOperand(i).getOpcode())
+ return SDValue();
- // Vectors containing all zeros can be matched by pxor and xorps later
+ // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ switch (Opcode) {
+ default:
+ return SDValue();
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ if (!TLI.isOperationLegalOrPromote(Opcode, VT))
+ return SDValue();
+ break;
+ }
+
+ SmallVector<SDValue, 4> LHSElts, RHSElts;
+ for (SDValue Elt : Op->ops()) {
+ SDValue LHS = Elt.getOperand(0);
+ SDValue RHS = Elt.getOperand(1);
+
+ // We expect the canonicalized RHS operand to be the constant.
+ if (!isa<ConstantSDNode>(RHS))
+ return SDValue();
+ LHSElts.push_back(LHS);
+ RHSElts.push_back(RHS);
+ }
+
+ SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
+ SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+}
+
+/// Create a vector constant without a load. SSE/AVX provide the bare minimum
+/// functionality to do this, so it's all zeros, all ones, or some derivation
+/// that is cheap to calculate.
+static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
- return getZeroVector(VT, Subtarget, DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, DL);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
- if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+ if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- if (!VT.is512BitVector())
- return getOnesVector(VT, Subtarget, DAG, dl);
+ return getOnesVector(VT, Subtarget, DAG, DL);
}
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ MVT VT = Op.getSimpleValueType();
+ MVT ExtVT = VT.getVectorElementType();
+ unsigned NumElems = Op.getNumOperands();
+
+ // Generate vectors for predicate vectors.
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
+ if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
+ return VectorConstant;
+
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
return AddSub;
@@ -6358,6 +6598,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return HorizontalOp;
if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
+ return BitOp;
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -6368,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SmallSet<SDValue, 8> Values;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.getOpcode() == ISD::UNDEF)
+ if (Elt.isUndef())
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
@@ -6397,7 +6639,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
- if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+ if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle SSE only.
@@ -6422,7 +6664,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
- (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
@@ -6439,16 +6681,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// it to i32 first.
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- if (VT.is256BitVector()) {
- if (Subtarget->hasAVX()) {
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+ if (VT.getSizeInBits() >= 256) {
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ if (Subtarget.hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
} else {
// Without AVX, we need to extend to a 128-bit vector and then
// insert into the 256-bit vector.
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
+ Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
}
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
@@ -6504,28 +6747,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- // For AVX-length vectors, see if we can use a vector load to get all of the
- // elements, otherwise build the individual 128-bit pieces and use
+ // See if we can use a vector load to get all of the elements.
+ if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ return LD;
+ }
+
+ // For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
-
- // Check for a build vector of consecutive loads.
- if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
- return LD;
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
- makeArrayRef(&V[0], NumElems/2));
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
- makeArrayRef(&V[NumElems / 2], NumElems/2));
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
- return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
- return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -6557,30 +6802,30 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
- SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
+ SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+ Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
- V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
- V[i] = V[i*2]; // Must be a zero vector.
+ Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
- V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
- V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
- V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
@@ -6593,32 +6838,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
- return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
if (Values.size() > 1 && VT.is128BitVector()) {
- // Check for a build vector of consecutive loads.
- for (unsigned i = 0; i < NumElems; ++i)
- V[i] = Op.getOperand(i);
-
- // Check for elements which are consecutive loads.
- if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
- return LD;
-
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Result;
- if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
- if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+ if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
@@ -6628,11 +6865,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
+ SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
- if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
- V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ if (!Op.getOperand(i).isUndef())
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
- V[i] = DAG.getUNDEF(VT);
+ Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
@@ -6642,20 +6880,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned EltStride = NumElems >> 1;
while (EltStride != 0) {
for (unsigned i = 0; i < EltStride; ++i) {
- // If V[i+EltStride] is undef and this is the first round of mixing,
+ // If Ops[i+EltStride] is undef and this is the first round of mixing,
// then it is safe to just drop this shuffle: V[i] is already in the
// right place, the one element (since it's the first round) being
// inserted as undef can be dropped. This isn't safe for successive
// rounds because they will permute elements within both vectors.
- if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+ if (Ops[i+EltStride].isUndef() &&
EltStride == NumElems/2)
continue;
- V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
}
EltStride >>= 1;
}
- return V[0];
+ return Ops[0];
}
return SDValue();
}
@@ -6673,21 +6911,23 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
if (ResVT.is256BitVector())
- return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
SDValue V3 = Op.getOperand(2);
SDValue V4 = Op.getOperand(3);
- return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
- Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
+ return concat256BitVectors(
+ concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
+ concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
+ NumElems, DAG, dl);
}
- return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
@@ -6764,7 +7004,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
@@ -6800,24 +7040,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] != -1 && Mask[i] != i)
- return false;
- return true;
-}
-
-/// \brief Helper function to classify a mask as a single-input mask.
-///
-/// This isn't a generic single-input test because in the vector shuffle
-/// lowering we canonicalize single inputs to be the first input operand. This
-/// means we can more quickly test for a single input by only checking whether
-/// an input from the second operand exists. We also assume that the size of
-/// mask corresponds to the size of the input vectors which isn't true in the
-/// fully general case.
-static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
- for (int M : Mask)
- if (M >= (int)Mask.size())
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] >= 0 && Mask[i] != i)
return false;
+ }
return true;
}
@@ -6835,22 +7062,22 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
-/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
-/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
-/// *not* suitable for use with existing 128-bit shuffles as it will contain
-/// entries from both V1 and V2 inputs to the wider mask.
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
- SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
- RepeatedMask.resize(LaneSize, -1);
+/// suitable for use with existing 128-bit shuffles as entries from the second
+/// vector have been remapped to [LaneSize, 2*LaneSize).
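+///
+/// For example (indices illustrative), the v8i32 mask <0, 9, 2, 11, 4, 13, 6,
+/// 15> repeats within each 128-bit lane, giving RepeatedMask <0, 5, 2, 7>.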
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
@@ -6860,17 +7087,55 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
- if (RepeatedMask[i % LaneSize] == -1)
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
+ : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
- RepeatedMask[i % LaneSize] =
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
- else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
+/// Test whether a shuffle mask is equivalent within each 128-bit lane.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a shuffle mask is equivalent within each 256-bit lane.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
+}
+
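+/// Scale a shuffle mask by replicating each element Scale times, keeping
+/// sentinel (negative) entries as-is; for example (indices illustrative),
+/// scaling <1, 0> by 2 gives <2, 3, 0, 1>.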
+static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &ScaledMask) {
+ assert(0 < Scale && "Unexpected scaling factor");
+ int NumElts = Mask.size();
+ ScaledMask.assign(NumElts * Scale, -1);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+
+ // Repeat sentinel values in every mask element.
+ if (M < 0) {
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = M;
+ continue;
+ }
+
+ // Scale mask element and increment across each mask element.
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+ }
+}
+
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -6893,8 +7158,9 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
- for (int i = 0; i < Size; ++i)
- if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
@@ -6902,6 +7168,32 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
+  }
+
+ return true;
+}
+
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in
+/// both masks.
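+///
+/// For example (indices illustrative), <SM_SentinelUndef, SM_SentinelZero, 2,
+/// 3> is equivalent to <0, SM_SentinelZero, 2, 3> but not to <0, 1, 2, 3>.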
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
+ return false;
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+ return false;
+ else if (Mask[i] != ExpectedMask[i])
+ return false;
return true;
}
@@ -6914,8 +7206,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
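+///
+/// For example (mask illustrative), <3, 2, 1, 0> encodes to immediate 0x1B
+/// (0b00011011), with undef lanes defaulting to their identity index.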
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
- SelectionDAG &DAG) {
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
@@ -6923,11 +7214,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
- Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
- Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
- Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
- Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
- return DAG.getConstant(Imm, DL, MVT::i8);
+ Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
+ return Imm;
+}
+
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+ SelectionDAG &DAG) {
+ return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
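
Concretely, the immediate packs four 2-bit source indices little-endian, two bits per result slot, with undef slots defaulting to the identity index. A hypothetical standalone model:

    #include <cassert>

    // Hypothetical mirror of getV4X86ShuffleImm: undef slots keep identity.
    static unsigned packShuffleImm(const int Mask[4]) {
      unsigned Imm = 0;
      for (int i = 0; i != 4; ++i)
        Imm |= (Mask[i] < 0 ? i : Mask[i]) << (2 * i);
      return Imm;
    }

    int main() {
      const int Mask[4] = {3, 1, 0, 2};
      assert(packShuffleImm(Mask) == 0x87); // 3 | (1<<2) | (0<<4) | (2<<6)
      const int Undef[4] = {-1, -1, -1, -1};
      assert(packShuffleImm(Undef) == 0xE4); // identity 3,2,1,0 -> 11'10'01'00
    }
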
/// \brief Compute whether each element of a shuffle is zeroable.
@@ -6941,15 +7237,16 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
SmallBitVector Zeroable(Mask.size(), false);
-
- while (V1.getOpcode() == ISD::BITCAST)
- V1 = V1->getOperand(0);
- while (V2.getOpcode() == ISD::BITCAST)
- V2 = V2->getOperand(0);
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ int VectorSizeInBits = V1.getValueType().getSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Mask.size();
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
@@ -6958,38 +7255,119 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
continue;
}
- // If this is an index into a build_vector node (which has the same number
- // of elements), dig out the input value and use it.
+ // Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
- SDValue Input = V.getOperand(M % Size);
- // The UNDEF opcode check really should be dead code here, but not quite
- // worth asserting on (it isn't invalid, just unexpected).
- if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
- Zeroable[i] = true;
+    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+    // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef() || X86::isZeroNode(Op))
+ Zeroable[i] = true;
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ }
+ continue;
+ }
+
+    // If the BUILD_VECTOR has more elements, then all the (smaller) source
+    // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllZeroable = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
+ }
+ Zeroable[i] = AllZeroable;
+ continue;
+ }
}
return Zeroable;
}
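
With the new bitcast handling, a v4i32 shuffle reading from a bitcast v2i64 constant inspects each 32-bit half of the wide constant separately (the lshr + getLoBits path above). A standalone sketch of that extraction, assuming little-endian element order and hypothetical names:

    #include <cassert>
    #include <cstdint>

    // Is 32-bit element Idx of the v4i32 view of two 64-bit constants zero?
    // Mirrors the lshr + getLoBits logic with Scale = 2.
    static bool isZeroElt(const uint64_t Src[2], int Idx) {
      uint64_t Wide = Src[Idx / 2];                          // Owning element.
      uint32_t Narrow = uint32_t(Wide >> (32 * (Idx % 2)));  // Bitcasted half.
      return Narrow == 0;
    }

    int main() {
      const uint64_t Src[2] = {0xFFFFFFFF00000000ULL, 0x1ULL};
      assert(isZeroElt(Src, 0));   // Low half of element 0 is zero.
      assert(!isZeroElt(Src, 1));  // High half of element 0 is all-ones.
      assert(!isZeroElt(Src, 2));  // Low half of element 1 is 1.
      assert(isZeroElt(Src, 3));   // High half of element 1 is zero.
    }
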
+/// Try to lower a shuffle with a single PSHUFB of V1.
+/// This is only possible if V2 is unused (at all, or only for zero elements).
+static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ const int NumBytes = VT.getSizeInBits() / 8;
+ const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+ assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+ (Subtarget.hasAVX2() && VT.is256BitVector()) ||
+ (Subtarget.hasBWI() && VT.is512BitVector()));
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
+ // Sign bit set in i8 mask means zero element.
+ SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / NumEltBytes];
+ if (M < 0) {
+ PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+ if (Zeroable[i / NumEltBytes]) {
+ PSHUFBMask[i] = ZeroMask;
+ continue;
+ }
+ // Only allow V1.
+ if (M >= Size)
+ return SDValue();
+
+ // PSHUFB can't cross lanes, ensure this doesn't happen.
+ if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
+ return SDValue();
+
+ M = M % LaneSize;
+ M = M * NumEltBytes + (i % NumEltBytes);
+ PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
+ }
+
+ MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
+ DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
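
The byte-mask construction expands each element index into its component byte indices and uses 0x80 to select a zero byte. A hypothetical 128-bit model (for simplicity it treats undef as zero, where the code above emits an undef byte):

    #include <cstdio>
    #include <vector>

    // Hypothetical model of PSHUFB byte-mask construction for one lane,
    // EltBytes bytes per element; only V1 indices are allowed here.
    static bool buildPSHUFBMask(const std::vector<int> &Mask, int EltBytes,
                                std::vector<int> &Bytes) {
      int Size = (int)Mask.size(), NumBytes = Size * EltBytes;
      Bytes.assign(NumBytes, -1);
      for (int i = 0; i < NumBytes; ++i) {
        int M = Mask[i / EltBytes];
        if (M < 0) { Bytes[i] = 0x80; continue; } // Zero (undef here).
        if (M >= Size) return false;              // Only V1 in this sketch.
        Bytes[i] = M * EltBytes + (i % EltBytes);
      }
      return true;
    }

    int main() {
      std::vector<int> Bytes;
      // v4i32 mask {2, -1, 0, 3} -> bytes 8..11, four 0x80s, 0..3, 12..15.
      if (buildPSHUFBMask({2, -1, 0, 3}, 4, Bytes))
        for (int B : Bytes) std::printf("%d ", B);
    }
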
+
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
-static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
- SmallVector<int, 8> Unpckl;
- SmallVector<int, 8> Unpckh;
+ SmallVector<int, 8> Unpckl(NumElts);
+ SmallVector<int, 8> Unpckh(NumElts);
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
int HiPos = LoPos + NumEltsInLane / 2;
- Unpckl.push_back(LoPos);
- Unpckh.push_back(HiPos);
+ Unpckl[i] = LoPos;
+ Unpckh[i] = HiPos;
}
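
For reference, the LoPos/HiPos formulas above reproduce the UNPCKL/UNPCKH interleaving; for v8i16 they give {0,8,1,9,2,10,3,11} and {4,12,5,13,6,14,7,15}. A standalone check (illustration only):

    #include <cstdio>

    int main() {
      // v8i16: 8 elements, all in one 128-bit lane (NumEltsInLane = 8).
      const int NumElts = 8, NumEltsInLane = 8;
      int Unpckl[NumElts], Unpckh[NumElts];
      for (int i = 0; i < NumElts; ++i) {
        int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
        int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
        Unpckl[i] = LoPos;
        Unpckh[i] = LoPos + NumEltsInLane / 2;
      }
      for (int i = 0; i < NumElts; ++i)
        std::printf("%d/%d ", Unpckl[i], Unpckh[i]);
      // Prints: 0/4 8/12 1/5 9/13 2/6 10/14 3/7 11/15
    }
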
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
@@ -7013,7 +7391,7 @@ static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
MVT EltVT = VT.getVectorElementType();
@@ -7044,7 +7422,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
- SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
V = DAG.getNode(VT.isFloatingPoint()
? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
DL, VT, V, VMask);
@@ -7056,7 +7434,7 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
-static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
@@ -7067,12 +7445,12 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
- SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
// We have to cast V2 around.
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
@@ -7088,9 +7466,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
@@ -7153,13 +7531,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
case MVT::v4i64:
case MVT::v8i32:
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
// FALLTHROUGH
case MVT::v2i64:
case MVT::v4i32:
// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
// that instruction.
- if (Subtarget->hasAVX2()) {
+ if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
@@ -7184,14 +7562,14 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
}
case MVT::v16i16: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
- if (RepeatedMask[i] >= 16)
+ if (RepeatedMask[i] >= 8)
BlendMask |= 1u << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
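
The corrected comparison matters because the repeated mask is only eight elements wide, so second-input words appear as indices 8..15 rather than 16..31. A hypothetical sketch of the immediate packing:

    #include <cassert>

    // Hypothetical model: build the 8-bit PBLENDW immediate from a repeated
    // v8i16 mask, where indices >= 8 select the second input.
    static unsigned blendImm(const int Repeated[8]) {
      unsigned Imm = 0;
      for (int i = 0; i < 8; ++i)
        if (Repeated[i] >= 8) // Was mistakenly '>= 16' before this patch.
          Imm |= 1u << i;
      return Imm;
    }

    int main() {
      // Take words 0..3 from V1 and words 4..7 from V2: imm = 0xF0.
      const int Repeated[8] = {0, 1, 2, 3, 12, 13, 14, 15};
      assert(blendImm(Repeated) == 0xF0);
    }
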
@@ -7200,7 +7578,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
- assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
+ assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
@@ -7235,10 +7613,9 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
- return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- BlendVT, VSELECTMask),
- V1, V2));
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
}
default:
@@ -7251,8 +7628,8 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
-static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2,
+static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// We build up the blend mask while checking whether a blend is a viable way
@@ -7266,7 +7643,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
- if (BlendMask[Mask[i] % Size] == -1)
+ if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
@@ -7285,8 +7662,8 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
- SDValue V1,
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
+ MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
@@ -7335,10 +7712,10 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2,
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
@@ -7357,9 +7734,8 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue Lo, Hi;
for (int l = 0; l < NumElts; l += NumLaneElts) {
for (int i = 0; i < NumLaneElts; ++i) {
- if (Mask[l + i] == -1)
+ if (Mask[l + i] < 0)
continue;
- assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
// Get the mod-Size index and lane correct it.
int LaneIdx = (Mask[l + i] % NumElts) - l;
@@ -7411,19 +7787,22 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
else if (!Hi)
Hi = Lo;
+  // Cast the inputs to an i8 vector of the correct length to match PALIGNR
+  // or PSLLDQ/PSRLDQ.
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getBitcast(ByteVT, Lo);
+ Hi = DAG.getBitcast(ByteVT, Hi);
+
// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int Scale = 16 / NumLaneElts;
// SSSE3 targets can use the palignr instruction.
- if (Subtarget->hasSSSE3()) {
- // Cast the inputs to i8 vector of correct length to match PALIGNR.
- MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
- Lo = DAG.getBitcast(AlignVT, Lo);
- Hi = DAG.getBitcast(AlignVT, Hi);
-
+ if (Subtarget.hasSSSE3()) {
+ assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
+ "512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
@@ -7431,21 +7810,19 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
+ assert(ByteVT == MVT::v16i8 &&
+ "SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;
- // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
- Lo = DAG.getBitcast(MVT::v2i64, Lo);
- Hi = DAG.getBitcast(MVT::v2i64, Hi);
-
- SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+ SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
- SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+ SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
- DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+ DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
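
To make the scaling concrete: the PALIGNR immediate is the element rotation times the element byte width (16 / NumLaneElts), and the SSE2 fallback splits the same byte count between PSRLDQ and PSLLDQ. A small hypothetical check:

    #include <cassert>

    // Hypothetical: convert an element rotation into the PALIGNR byte
    // immediate for a 128-bit lane with NumLaneElts elements.
    static int palignrImm(int Rotation, int NumLaneElts) {
      int Scale = 16 / NumLaneElts; // Bytes per element.
      return Rotation * Scale;
    }

    int main() {
      assert(palignrImm(1, 4) == 4); // v4i32: one-element rotate = 4 bytes.
      assert(palignrImm(3, 8) == 6); // v8i16: three-element rotate = 6 bytes.
      // SSE2 fallback splits the same amount into PSRLDQ/PSLLDQ shifts:
      int HiByteShift = palignrImm(1, 4);  // 4
      int LoByteShift = 16 - HiByteShift;  // 12
      assert(LoByteShift + HiByteShift == 16);
    }
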
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
@@ -7471,8 +7848,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -7510,7 +7888,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
- MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
@@ -7526,7 +7905,8 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
- for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left))
@@ -7539,7 +7919,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
}
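
The shift matcher asks whether every defined element moved by a uniform amount with zeroable elements shifted in, as in the bracketed examples above. A simplified standalone model of the right-shift case, using -2 for a zero element in the spirit of SM_SentinelZero:

    #include <cassert>
    #include <vector>

    // Hypothetical: does Mask move element i + Shift to position i, with the
    // vacated high positions zeroable? (-1 = undef, -2 = zero.)
    static bool isRightShift(const std::vector<int> &Mask, int Shift) {
      int Size = (int)Mask.size();
      for (int i = 0; i < Size; ++i) {
        if (Mask[i] == -1)
          continue;               // Undef matches anything.
        if (i >= Size - Shift)
          return Mask[i] == -2;   // Shifted-in positions must be zero.
        if (Mask[i] != -2 && Mask[i] != i + Shift)
          return false;
      }
      return true;
    }

    int main() {
      // Matches the comment example [ 5, 6, 7, zz, zz, zz, zz, zz ].
      assert(isRightShift({5, 6, 7, -2, -2, -2, -2, -2}, 5));
      assert(!isRightShift({5, 6, 7, 0, -2, -2, -2, -2}, 5));
    }
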
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -7679,8 +8059,8 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
- ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+ ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
@@ -7713,14 +8093,20 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+ InputV = ShuffleOffset(InputV);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ if (VT.is256BitVector())
+ InputV = extract128BitVector(InputV, 0, DAG, DL);
+
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
return DAG.getBitcast(VT, InputV);
}
@@ -7752,33 +8138,33 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
- if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
- SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(LoIdx, DL, MVT::i8)));
+ SDValue Lo = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
!SafeOffset(Offset + 1))
- return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+ return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
- SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(HiIdx, DL, MVT::i8)));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ SDValue Hi = DAG.getBitcast(
+ MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
- if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+ if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
@@ -7787,10 +8173,9 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
- return DAG.getBitcast(VT,
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v16i8, PSHUFBMask)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
@@ -7837,8 +8222,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
@@ -7858,7 +8243,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
- if (M == -1)
+ if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
@@ -7960,8 +8345,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
+ V = peekThroughBitcasts(V);
+
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
@@ -7974,7 +8359,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
- return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
+ return DAG.getBitcast(EltVT, S);
}
return SDValue();
@@ -7985,9 +8370,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
+ V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
@@ -7996,8 +8379,8 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -8054,7 +8437,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
// This is essentially a special case blend operation, but if we have
// general purpose blend operations, they are always faster. Bail and let
// the rest of the lowering handle these as blends.
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
@@ -8082,9 +8465,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
- V2 = DAG.getBitcast(MVT::v2i64, V2);
+ V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
- X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+ X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
DAG.getDataLayout(), VT)));
@@ -8094,15 +8477,15 @@ static SDValue lowerVectorShuffleAsElementInsertion(
return V2;
}
-/// \brief Try to lower broadcast of a single - truncated - integer element,
+/// Try to lower broadcast of a single (truncated) integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
-static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
- int BroadcastIdx,
- const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
+ SDValue V0, int BroadcastIdx,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX2() &&
+ assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
EVT EltVT = VT.getVectorElementType();
@@ -8153,38 +8536,57 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
-static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasAVX())
- return SDValue();
- if (VT.isInteger() && !Subtarget->hasAVX2())
+ if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+ (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+ (Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
+  // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
+  // we can only broadcast from a register when we have AVX2.
+ unsigned NumElts = Mask.size();
+ unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+ bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
- for (int M : Mask)
- if (M >= 0 && BroadcastIdx == -1)
- BroadcastIdx = M;
- else if (M >= 0 && M != BroadcastIdx)
- return SDValue();
+ for (int i = 0; i != (int)NumElts; ++i) {
+ SmallVector<int, 8> BroadcastMask(NumElts, i);
+ if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
+ BroadcastIdx = i;
+ break;
+ }
+ }
+ if (BroadcastIdx < 0)
+ return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
+ case ISD::BITCAST: {
+ SDValue VSrc = V.getOperand(0);
+ MVT SrcVT = VSrc.getSimpleValueType();
+ if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
+ break;
+ V = VSrc;
+ continue;
+ }
case ISD::CONCAT_VECTORS: {
int OperandSize = Mask.size() / V.getNumOperands();
V = V.getOperand(BroadcastIdx / OperandSize);
BroadcastIdx %= OperandSize;
continue;
}
-
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
@@ -8219,45 +8621,76 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
MVT BroadcastVT = VT;
// Peek through any bitcast (only useful for loads).
- SDValue BC = V;
- while (BC.getOpcode() == ISD::BITCAST)
- BC = BC.getOperand(0);
+ SDValue BC = peekThroughBitcasts(V);
// Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
- // If the scalar isn't a load, we can't broadcast from it in AVX1.
- // Only AVX2 has register broadcasts.
- if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+ // If we can't broadcast from a register, check that the input is a load.
+ if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
+ if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+ Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+ }
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(BC);
SDValue BaseAddr = Ld->getOperand(1);
- EVT AddrVT = BaseAddr.getValueType();
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, DL, AddrVT, BaseAddr,
- DAG.getConstant(Offset, DL, AddrVT));
+ SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
- } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
- // We can't broadcast from a vector register without AVX2, and we can only
- // broadcast from the zero-element of a vector register.
+ } else if (!BroadcastFromReg) {
+ // We can't broadcast from a vector register.
return SDValue();
+ } else if (BroadcastIdx != 0) {
+ // We can only broadcast from the zero-element of a vector register,
+ // but it can be advantageous to broadcast from the zero-element of a
+ // subvector.
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+
+ // Only broadcast the zero-element of a 128-bit subvector.
+ unsigned EltSize = VT.getScalarSizeInBits();
+ if (((BroadcastIdx * EltSize) % 128) != 0)
+ return SDValue();
+
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
+ DAG.getIntPtrConstant(BroadcastIdx, DL));
}
- V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V);
- return DAG.getBitcast(VT, V);
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ DAG.getBitcast(MVT::f64, V));
+
+ // Bitcast back to the same scalar type as BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ } else {
+ SrcVT = BroadcastVT.getScalarType();
+ }
+ V = DAG.getBitcast(SrcVT, V);
+ }
+
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
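
The new detection loop is stronger than the old scan: rather than requiring all defined indices to be literally equal, it tests equivalence against each candidate splat mask, which also accepts matching build_vector operands. A plain-index approximation (hypothetical, ignoring the build_vector case):

    #include <cassert>
    #include <vector>

    // Hypothetical: find i such that every defined mask element equals i,
    // i.e. the shuffle is a splat of element i. Returns -1 if none.
    static int findBroadcastIdx(const std::vector<int> &Mask) {
      int NumElts = (int)Mask.size();
      for (int i = 0; i != NumElts; ++i) {
        bool AllMatch = true;
        for (int M : Mask)
          AllMatch &= (M < 0 || M == i);
        if (AllMatch)
          return i;
      }
      return -1;
    }

    int main() {
      assert(findBroadcastIdx({2, -1, 2, 2}) == 2);
      assert(findBroadcastIdx({0, 1, 0, 1}) == -1);
    }
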
// Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -8266,16 +8699,14 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
- assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
+ assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
unsigned ZMask = 0;
int V1DstIndex = -1;
int V2DstIndex = -1;
@@ -8295,8 +8726,8 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
}
// We can only insert a single non-zeroable element.
- if (V1DstIndex != -1 || V2DstIndex != -1)
- return SDValue();
+ if (V1DstIndex >= 0 || V2DstIndex >= 0)
+ return false;
if (Mask[i] < 4) {
// V1 input out of place for insertion.
@@ -8308,13 +8739,13 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
}
// Don't bother if we have no (non-zeroable) element for insertion.
- if (V1DstIndex == -1 && V2DstIndex == -1)
- return SDValue();
+ if (V1DstIndex < 0 && V2DstIndex < 0)
+ return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned V2SrcIndex = 0;
- if (V1DstIndex != -1) {
+ if (V1DstIndex >= 0) {
    // If we have a V1 input out of place, we use V1 as the source of the
    // element insertion and don't use the original V2 at all.
V2SrcIndex = Mask[V1DstIndex];
@@ -8329,11 +8760,25 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
if (!V1UsedInPlace)
V1 = DAG.getUNDEF(MVT::v4f32);
- unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+  // Form the InsertPS immediate from the source/dest indices and zero mask.
+ InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ // Attempt to match the insertps pattern.
+ unsigned InsertPSMask;
+ if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ return SDValue();
// Insert the V2 element into the desired position.
- SDLoc DL(Op);
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
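
The InsertPS immediate layout is src << 6 | dst << 4 | zmask: bits 7:6 select the source element of the inserted vector, bits 5:4 the destination slot, and the low four bits zero out result slots. A hypothetical worked example:

    #include <cassert>

    // Hypothetical: pack an INSERTPS immediate from its three fields.
    static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                unsigned ZMask) {
      return (SrcIdx << 6) | (DstIdx << 4) | ZMask;
    }

    int main() {
      // Insert element 2 of V2 into slot 1 of V1 and zero slot 3: 0x98.
      assert(insertPSImm(2, 1, 0x8) == 0x98);
      assert((insertPSImm(2, 1, 0x8) & ~0xFFu) == 0); // Fits in an i8.
    }
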
@@ -8347,29 +8792,30 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
- assert(!isSingleInputShuffleMask(Mask) &&
+ assert(VT.is128BitVector() &&
+ "This routine only works on 128-bit vectors.");
+ assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
- int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
- return M >= 0 && M % Size < Size / 2;
- });
- int NumHiInputs = std::count_if(
- Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+ int NumLoInputs =
+ count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+ int NumHiInputs =
+ count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
- auto TryUnpack = [&](MVT UnpackVT, int Scale) {
- SmallVector<int, 32> V1Mask(Mask.size(), -1);
- SmallVector<int, 32> V2Mask(Mask.size(), -1);
+ auto TryUnpack = [&](int ScalarSize, int Scale) {
+ SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+ SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
@@ -8401,6 +8847,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
@@ -8412,15 +8859,10 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
  // We try each unpack from the largest to the smallest, looking for one
  // that fits this mask.
- int OrigNumElements = VT.getVectorNumElements();
int OrigScalarSize = VT.getScalarSizeInBits();
- for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
- int Scale = ScalarSize / OrigScalarSize;
- int NumElements = OrigNumElements / Scale;
- MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
- if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+ if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
- }
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
@@ -8434,8 +8876,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
- SmallVector<int, 32> PermMask;
- PermMask.assign(Size, -1);
+ SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
@@ -8461,28 +8902,25 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
-static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
- if (isSingleInputShuffleMask(Mask)) {
- // Use low duplicate instructions for masks that match their pattern.
- if (Subtarget->hasSSE3())
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
- return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+ if (V2.isUndef()) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
- if (Subtarget->hasAVX()) {
+ if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
@@ -8521,7 +8959,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
@@ -8542,21 +8980,18 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
-static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -8576,28 +9011,29 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a blend of two PACKUS operations an the blend aligns with the
- // low and half halves, we can just merge the PACKUS operations. This is
- // particularly important as it lets us merge shuffles that this routine itself
- // creates.
+ // If we have a blend of two same-type PACKUS operations and the blend aligns
+ // with the low and high halves, we can just merge the PACKUS operations.
+ // This is particularly important as it lets us merge shuffles that this
+ // routine itself creates.
auto GetPackNode = [](SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
+ V = peekThroughBitcasts(V);
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
};
if (SDValue V1Pack = GetPackNode(V1))
- if (SDValue V2Pack = GetPackNode(V2))
- return DAG.getBitcast(MVT::v2i64,
- DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
- Mask[0] == 0 ? V1Pack.getOperand(0)
- : V1Pack.getOperand(1),
- Mask[1] == 2 ? V2Pack.getOperand(0)
- : V2Pack.getOperand(1)));
+ if (SDValue V2Pack = GetPackNode(V2)) {
+ EVT PackVT = V1Pack.getValueType();
+ if (PackVT == V2Pack.getValueType())
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+ }
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
@@ -8614,7 +9050,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
@@ -8627,7 +9063,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget->hasSSSE3())
+ if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
@@ -8655,12 +9091,16 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
- if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
- if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
@@ -8671,14 +9111,13 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
-static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index =
@@ -8689,7 +9128,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
// the low bit.
int V2AdjIndex = V2Index ^ 1;
- if (Mask[V2AdjIndex] == -1) {
+ if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
@@ -8761,35 +9200,31 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
- if (Subtarget->hasSSE3()) {
+ if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
- if (Subtarget->hasAVX()) {
+ if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
@@ -8812,13 +9247,13 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
@@ -8827,6 +9262,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return BlendPerm;
}
+ // Use low/high mov instructions.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
@@ -8840,15 +9281,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
-static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
@@ -8858,13 +9296,12 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return ZExt;
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -8884,8 +9321,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
@@ -8896,7 +9333,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
@@ -8913,7 +9350,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget->hasSSSE3())
+ if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
@@ -8957,8 +9394,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
- SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
@@ -8987,6 +9424,26 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+  // If we are splatting two values from one half (one to each half), then
+ // we can shuffle that half so each is splatted to a dword, then splat those
+ // to their respective halves.
+ auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
+ int DOffset) {
+ int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
+ int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
+ V = DAG.getNode(ShufWOp, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getBitcast(PSHUFDVT, V);
+ V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+ return DAG.getBitcast(VT, V);
+ };
+
+ if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
+ return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
+ if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
+ return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
+
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
@@ -9096,9 +9553,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
- if (M != -1 && M == FixIdx)
+ if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
- else if (M != -1 && M == FixFreeIdx)
+ else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
@@ -9123,9 +9580,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
- if (M != -1 && M/2 == ADWord)
+ if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
- else if (M != -1 && M/2 == BDWord)
+ else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
@@ -9194,7 +9651,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
- return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+ return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
@@ -9213,7 +9670,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
- if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
@@ -9234,7 +9691,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
}
// Map the input's dword into the correct half.
- if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
@@ -9280,17 +9737,17 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
- SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
- SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+ SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
- } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
- SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+ } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
+ SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
@@ -9304,7 +9761,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
- assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+ assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
@@ -9338,8 +9795,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
}
// Now hoist the DWord down to the right half.
- int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
- assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
@@ -9367,11 +9824,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
- assert(std::count_if(LoMask.begin(), LoMask.end(),
- [](int M) { return M >= 4; }) == 0 &&
+ assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
- assert(std::count_if(HiMask.begin(), HiMask.end(),
- [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
@@ -9390,11 +9845,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
return V;
}
-/// \brief Helper to form a PSHUFB-based shuffle+blend.
-static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG, bool &V1InUse,
- bool &V2InUse) {
+/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
+/// blend if only one input is used.
+static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -9404,7 +9859,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
int Size = Mask.size();
int Scale = 16 / Size;
for (int i = 0; i < 16; ++i) {
- if (Mask[i / Scale] == -1) {
+ if (Mask[i / Scale] < 0) {
V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
const int ZeroMask = 0x80;
@@ -9425,11 +9880,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V1),
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V2),
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+ DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
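// A minimal standalone sketch of the byte-mask construction above, assuming a
// v8i16 blend mask <0,9,2,11,4,13,6,15> (Scale = 16/8 = 2); 0x80 is PSHUFB's
// zeroing selector, so OR-ing the two shuffled results performs the blend.
#include <array>
#include <cassert>
int main() {
  std::array<int, 8> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  const int Scale = 2, ZeroMask = 0x80;
  std::array<int, 16> V1Bytes, V2Bytes;
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    int Byte = M * Scale + i % Scale;       // byte index within <V1,V2>
    V1Bytes[i] = M < 8 ? Byte : ZeroMask;   // zero the lanes taken from V2
    V2Bytes[i] = M < 8 ? ZeroMask : Byte - 16;
  }
  assert(V1Bytes[0] == 0 && V1Bytes[2] == ZeroMask); // word 0 from V1
  assert(V2Bytes[2] == 2 && V2Bytes[3] == 3);        // word 1 from V2's word 1
  return 0;
}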
// If we need shuffled inputs from both, blend the two.
SDValue V;
@@ -9454,42 +9909,31 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
-static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
- MutableArrayRef<int> Mask(MaskStorage);
-
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- auto isV1 = [](int M) { return M >= 0 && M < 8; };
- (void)isV1;
- auto isV2 = [](int M) { return M >= 8; };
-
- int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+ int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9502,21 +9946,24 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return Rotate;
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
- Subtarget, DAG);
+ // Make a copy of the mask so it can be modified.
+ SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
+ MutableMask, Subtarget,
+ DAG);
}
- assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+ assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
- if (Subtarget->hasSSE4A())
+ if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
return V;
@@ -9528,7 +9975,7 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
- bool IsBlendSupported = Subtarget->hasSSE41();
+ bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
@@ -9552,16 +9999,17 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
+ // Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
- if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+ if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
- V1InUse, V2InUse);
+ return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
@@ -9591,10 +10039,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
- // Figure out whether we're looping over two inputs or just one.
- bool IsSingleInput = isSingleInputShuffleMask(Mask);
-
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
@@ -9611,7 +10057,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes; we'll optimistically collapse them to the pattern
    // we want.
- if (Mask[i] == -1)
+ if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
@@ -9645,20 +10091,17 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
-static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -9672,18 +10115,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
- if (Subtarget->hasSSE4A())
+ if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
return V;
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
@@ -9696,7 +10138,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
- if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
+ if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
@@ -9734,7 +10176,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
- while (j < je && PreDupI16Shuffle[j] != -1)
+ while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
@@ -9759,10 +10201,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
- if (Mask[i] != -1) {
+ if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
- if (PostDupI16Shuffle[i / 2] == -1)
+ if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
@@ -9799,18 +10241,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
- SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
- DAG, V1InUse, V2InUse);
+ SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Blend;
@@ -9848,11 +10290,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
- if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+ bool IsSingleInput = V2.isUndef();
+ if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
- bool IsSingleInput = isSingleInputShuffleMask(Mask);
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
@@ -9907,7 +10349,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
- DAG.getConstant(0x00FF, DL, MVT::v8i16));
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
@@ -9938,22 +10380,23 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
@@ -9971,21 +10414,22 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
+ WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
// If both elements are undef, its trivial.
if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
- WidenedMask.push_back(SM_SentinelUndef);
+ WidenedMask[i/2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
- WidenedMask.push_back(Mask[i + 1] / 2);
+ WidenedMask[i/2] = Mask[i + 1] / 2;
continue;
}
if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask.push_back(Mask[i] / 2);
+ WidenedMask[i/2] = Mask[i] / 2;
continue;
}
@@ -9993,7 +10437,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
(Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
- WidenedMask.push_back(SM_SentinelZero);
+ WidenedMask[i/2] = SM_SentinelZero;
continue;
}
return false;
@@ -10002,7 +10446,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask.push_back(Mask[i] / 2);
+ WidenedMask[i/2] = Mask[i] / 2;
continue;
}
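// A minimal standalone sketch of the widening rules above, assuming the usual
// sentinel values (SM_SentinelUndef = -1, SM_SentinelZero = -2): aligned
// adjacent pairs <2k, 2k+1> widen to k, an undef entry may adopt its
// neighbour's alignment, and zero/undef pairs widen to zero.
#include <cassert>
#include <vector>
static bool widenMask(const std::vector<int> &M, std::vector<int> &W) {
  W.assign(M.size() / 2, 0);
  for (size_t i = 0; i < M.size(); i += 2) {
    int Lo = M[i], Hi = M[i + 1];
    if (Lo == -1 && Hi == -1)                        W[i / 2] = -1;
    else if (Lo == -1 && Hi >= 0 && Hi % 2 == 1)     W[i / 2] = Hi / 2;
    else if (Hi == -1 && Lo >= 0 && Lo % 2 == 0)     W[i / 2] = Lo / 2;
    else if (Lo < 0 && Hi < 0)                       W[i / 2] = -2; // zero pair
    else if (Lo >= 0 && Lo % 2 == 0 && Lo + 1 == Hi) W[i / 2] = Lo / 2;
    else return false;
  }
  return true;
}
int main() {
  std::vector<int> W;
  assert(widenMask({0, 1, 6, 7}, W) && W == (std::vector<int>{0, 3}));
  assert(!widenMask({0, 2, 4, 5}, W)); // <0,2> is not an aligned pair
  return 0;
}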
@@ -10020,7 +10464,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
-static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
@@ -10039,8 +10483,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
- while (V.getOpcode() == ISD::BITCAST)
- V = V->getOperand(0);
+ V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
@@ -10063,8 +10506,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
- LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
- HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
+ HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
@@ -10077,7 +10520,9 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
- SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
@@ -10085,21 +10530,15 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
UseHiV2 = true;
else
UseLoV2 = true;
- V2BlendMask.push_back(M - NumElements);
- V1BlendMask.push_back(-1);
- BlendMask.push_back(SplitNumElements + i);
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
- V2BlendMask.push_back(-1);
- V1BlendMask.push_back(M);
- BlendMask.push_back(i);
- } else {
- V2BlendMask.push_back(-1);
- V1BlendMask.push_back(-1);
- BlendMask.push_back(-1);
+ V1BlendMask[i] = M;
+ BlendMask[i] = i;
}
}
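// A minimal standalone sketch of the blend-mask bookkeeping above, assuming a
// 256-bit v8i32 shuffle whose low-half mask is <0,8,1,9> (NumElements = 8,
// SplitNumElements = 4): V1 and V2 each get a private half-width mask, and
// BlendMask stitches the two shuffled halves back together.
#include <array>
#include <cassert>
int main() {
  std::array<int, 4> HalfMask = {0, 8, 1, 9};
  std::array<int, 4> V1Blend, V2Blend, Blend;
  V1Blend.fill(-1); V2Blend.fill(-1); Blend.fill(-1);
  for (int i = 0; i < 4; ++i) {
    int M = HalfMask[i];
    if (M >= 8) { V2Blend[i] = M - 8; Blend[i] = 4 + i; }
    else if (M >= 0) { V1Blend[i] = M; Blend[i] = i; }
  }
  assert((V1Blend == std::array<int, 4>{0, -1, 1, -1}));
  assert((V2Blend == std::array<int, 4>{-1, 0, -1, 1}));
  assert((Blend == std::array<int, 4>{0, 5, 2, 7})); // interleave the results
  return 0;
}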
@@ -10151,12 +10590,12 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
-static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
+static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
SelectionDAG &DAG) {
- assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
- "lower single-input shuffles as it "
- "could then recurse on itself.");
+ assert(!V2.isUndef() && "This routine must not be used to lower single-input "
+ "shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
@@ -10166,12 +10605,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
- if (V2BroadcastIdx == -1)
+ if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
- if (V1BroadcastIdx == -1)
+ if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
@@ -10210,54 +10649,51 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
- int LaneSize = Mask.size() / 2;
+ int Size = Mask.size();
+ int LaneSize = Size / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
- if (isSingleInputShuffleMask(Mask)) {
- SmallVector<int, 32> FlippedBlendMask;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- FlippedBlendMask.push_back(
- Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
- ? Mask[i]
- : Mask[i] % LaneSize +
- (i / LaneSize) * LaneSize + Size));
-
- // Flip the vector, and blend the results which should now be in-lane. The
- // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
- // 5 for the high source. The value 3 selects the high half of source 2 and
- // the value 2 selects the low half of source 2. We only use source 2 to
- // allow folding it into a memory operand.
- unsigned PERMMask = 3 | 2 << 4;
- SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
- V1, DAG.getConstant(PERMMask, DL, MVT::i8));
- return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
- }
-
- // This now reduces to two single-input shuffles of V1 and V2 which at worst
- // will be handled by the above logic and a blend of the results, much like
- // other patterns in AVX.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+ assert(V2.isUndef() &&
+         "The last part of this routine only works on single-input shuffles");
+
+ SmallVector<int, 32> FlippedBlendMask(Size);
+ for (int i = 0; i < Size; ++i)
+ FlippedBlendMask[i] =
+ Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+ ? Mask[i]
+ : Mask[i] % LaneSize +
+ (i / LaneSize) * LaneSize + Size);
+
+ // Flip the vector, and blend the results which should now be in-lane. The
+ // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+ // 5 for the high source. The value 3 selects the high half of source 2 and
+ // the value 2 selects the low half of source 2. We only use source 2 to
+ // allow folding it into a memory operand.
+ unsigned PERMMask = 3 | 2 << 4;
+ SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+ V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
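// A minimal standalone sketch of the VPERM2X128 immediate above, assuming the
// documented encoding: imm[1:0] selects the 128-bit half feeding the low
// result lane and imm[5:4] the high lane, where 0/1 are the halves of source 1
// and 2/3 the halves of source 2. With PERMMask = 3 | (2 << 4) and V1 bound to
// source 2, the result is <hi(V1), lo(V1)>, i.e. the lane flip we want.
#include <array>
#include <cassert>
int main() {
  std::array<std::array<int, 2>, 2> Src1 = {{{0, 1}, {2, 3}}}; // undef here
  std::array<std::array<int, 2>, 2> Src2 = {{{4, 5}, {6, 7}}}; // "V1"
  unsigned Imm = 3 | (2 << 4);                                 // 0x23
  auto Pick = [&](unsigned Sel) {
    return (Sel & 2) ? Src2[Sel & 1] : Src1[Sel & 1];
  };
  std::array<int, 2> Lo = Pick(Imm & 3), Hi = Pick((Imm >> 4) & 3);
  assert((Lo == std::array<int, 2>{6, 7})); // high half of source 2
  assert((Hi == std::array<int, 2>{4, 5})); // low half of source 2
  return 0;
}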
/// \brief Handle lowering 2-lane 128-bit shuffles.
-static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
@@ -10278,6 +10714,10 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -10349,10 +10789,9 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
- SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(!isSingleInputShuffleMask(Mask) &&
- "This is only useful with multiple inputs.");
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!V2.isUndef() && "This is only useful with multiple inputs.");
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
@@ -10361,10 +10800,8 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
// check whether the in-128-bit lane shuffles share a repeating pattern.
- SmallVector<int, 4> Lanes;
- Lanes.resize(NumLanes, -1);
- SmallVector<int, 4> InLaneMask;
- InLaneMask.resize(LaneSize, -1);
+ SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
+ SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
@@ -10392,8 +10829,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
// First shuffle the lanes into place.
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
VT.getSizeInBits() / 64);
- SmallVector<int, 8> LaneMask;
- LaneMask.resize(NumLanes * 2, -1);
+ SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
for (int i = 0; i < NumLanes; ++i)
if (Lanes[i] >= 0) {
LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
@@ -10408,8 +10844,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
// Now do a simple shuffle that isn't lane crossing.
- SmallVector<int, 8> NewMask;
- NewMask.resize(Size, -1);
+ SmallVector<int, 8> NewMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
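// A minimal standalone sketch of the second-stage mask above, assuming v8i32
// (LaneSize = 4): once the 128-bit lanes have been moved into place, every
// entry is rewritten to pick the same position within its own lane, i.e.
// NewMask[i] = (i / 4) * 4 + Mask[i] % 4.
#include <array>
#include <cassert>
int main() {
  std::array<int, 8> Mask = {4, 5, 6, 7, 12, 13, 14, 15}; // cross-lane sources
  std::array<int, 8> NewMask;
  for (int i = 0; i < 8; ++i)
    NewMask[i] = (i / 4) * 4 + Mask[i] % 4; // now entirely lane-local
  assert((NewMask == std::array<int, 8>{0, 1, 2, 3, 4, 5, 6, 7}));
  return 0;
}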
@@ -10422,11 +10857,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
-static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+ assert(VT.is256BitVector() && "Expected 256-bit vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
@@ -10457,21 +10893,16 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
- // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
- if (UndefLower && Subtarget->hasAVX2() &&
- (VT == MVT::v4f64 || VT == MVT::v4i64))
- return SDValue();
-
- // If the shuffle only uses the lower halves of the input operands,
+ // If the shuffle only uses two of the four halves of the input operands,
// then extract them and perform the 'half' shuffle at half width.
// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
int HalfIdx1 = -1, HalfIdx2 = -1;
- SmallVector<int, 8> HalfMask;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
unsigned Offset = UndefLower ? HalfNumElts : 0;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + Offset];
if (M < 0) {
- HalfMask.push_back(M);
+ HalfMask[i] = M;
continue;
}
@@ -10479,23 +10910,18 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
- // Only shuffle using the lower halves of the inputs.
- // TODO: Investigate usefulness of shuffling with upper halves.
- if (HalfIdx != 0 && HalfIdx != 2)
- return SDValue();
-
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
    // We can shuffle with up to 2 half vectors; set the new 'half'
// shuffle mask accordingly.
- if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
- HalfMask.push_back(HalfElt);
+ if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
+ HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
- if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
- HalfMask.push_back(HalfElt + HalfNumElts);
+ if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
+ HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
@@ -10505,6 +10931,33 @@ static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
}
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+ // Only shuffle the halves of the inputs when useful.
+ int NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ int NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+
+ // uuuuXXXX - don't extract uppers just to insert again.
+ if (UndefLower && NumUpperHalves != 0)
+ return SDValue();
+
+ // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
+ if (UndefUpper && NumUpperHalves == 2)
+ return SDValue();
+
+ // AVX2 - XXXXuuuu - always extract lowers.
+ if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64)
+ return SDValue();
+ // AVX2 supports variable 32-bit element cross-lane shuffles.
+ if (VT == MVT::v8f32 || VT == MVT::v8i32) {
+ // XXXXuuuu - don't extract lowers and uppers.
+ if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
+ return SDValue();
+ }
+ }
+
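// A minimal standalone sketch of the half-extraction path below, assuming a
// v8f32 mask <0,1,8,9,u,u,u,u> (UndefUpper with HalfIdx1 = 0, HalfIdx2 = 2):
// both used halves are lower halves, so the shuffle runs at half width with
// HalfMask <0,1,4,5> before being reinserted over an undef upper half.
#include <array>
#include <cassert>
int main() {
  std::array<int, 4> LoV1 = {0, 1, 2, 3}, LoV2 = {8, 9, 10, 11};
  std::array<int, 4> HalfMask = {0, 1, 4, 5}; // indexes into <LoV1, LoV2>
  std::array<int, 4> Half;
  for (int i = 0; i < 4; ++i)
    Half[i] = HalfMask[i] < 4 ? LoV1[HalfMask[i]] : LoV2[HalfMask[i] - 4];
  assert((Half == std::array<int, 4>{0, 1, 8, 9}));
  return 0;
}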
auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
@@ -10536,7 +10989,177 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
return true;
}
-static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+/// Handle the case where shuffle sources are coming from the same 128-bit lane
+/// and every lane can be represented as the same repeating mask, allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
+ // On AVX2 we may be able to just shuffle the lowest elements and then
+ // broadcast the result.
+ if (Subtarget.hasAVX2()) {
+ for (unsigned BroadcastSize : {16, 32, 64}) {
+ if (BroadcastSize <= VT.getScalarSizeInBits())
+ continue;
+ int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+      // Attempt to match a repeating pattern every NumBroadcastElts that,
+      // accounting for UNDEFs, only references the lowest 128-bit lane of
+      // the inputs.
+ auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ int &R = RepeatMask[j];
+ if (0 != ((M % NumElts) / NumLaneElts))
+ return false;
+ if (0 <= R && R != M)
+ return false;
+ R = M;
+ }
+ return true;
+ };
+
+ SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+ if (!FindRepeatingBroadcastMask(RepeatMask))
+ continue;
+
+ // Shuffle the (lowest) repeated elements in place for broadcast.
+ SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+ // Shuffle the actual broadcast.
+ SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumBroadcastElts)
+ for (int j = 0; j != NumBroadcastElts; ++j)
+ BroadcastMask[i + j] = j;
+ return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+ BroadcastMask);
+ }
+ }
+
+ // Bail if the shuffle mask doesn't cross 128-bit lanes.
+ if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+ return SDValue();
+
+ // Bail if we already have a repeated lane shuffle mask.
+ SmallVector<int, 8> RepeatedShuffleMask;
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+ return SDValue();
+
+ // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+  // (with PERMQ/PERMPD); otherwise we can only permute whole 128-bit lanes.
+ int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+ int NumSubLanes = NumLanes * SubLaneScale;
+ int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+ // Check that all the sources are coming from the same lane and see if we can
+ // form a repeating shuffle mask (local to each sub-lane). At the same time,
+ // determine the source sub-lane for each destination sub-lane.
+ int TopSrcSubLane = -1;
+ SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+ SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+
+ for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+ // Extract the sub-lane mask, check that it all comes from the same lane
+ // and normalize the mask entries to come from the first lane.
+ int SrcLane = -1;
+ SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumLaneElts;
+ if ((0 <= SrcLane) && (SrcLane != Lane))
+ return SDValue();
+ SrcLane = Lane;
+ int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
+ SubLaneMask[Elt] = LocalM;
+ }
+
+ // Whole sub-lane is UNDEF.
+ if (SrcLane < 0)
+ continue;
+
+ // Attempt to match against the candidate repeated sub-lane masks.
+ for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+ auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ if (M1[i] < 0 || M2[i] < 0)
+ continue;
+ if (M1[i] != M2[i])
+ return false;
+ }
+ return true;
+ };
+
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+ if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+ continue;
+
+ // Merge the sub-lane mask into the matching repeated sub-lane mask.
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ int M = SubLaneMask[i];
+ if (M < 0)
+ continue;
+ assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+ "Unexpected mask element");
+ RepeatedSubLaneMask[i] = M;
+ }
+
+      // Track the topmost source sub-lane; by setting the remaining to UNDEF
+ // we can greatly simplify shuffle matching.
+ int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+ TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+ Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+ break;
+ }
+
+ // Bail if we failed to find a matching repeated sub-lane mask.
+ if (Dst2SrcSubLanes[DstSubLane] < 0)
+ return SDValue();
+ }
+ assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+ "Unexpected source lane");
+
+ // Create a repeating shuffle mask for the entire vector.
+ SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+ for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+ int Lane = SubLane / SubLaneScale;
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = RepeatedSubLaneMask[Elt];
+ if (M < 0)
+ continue;
+ int Idx = (SubLane * NumSubLaneElts) + Elt;
+ RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+ }
+ }
+ SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+ // Shuffle each source sub-lane to its destination.
+ SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+ int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+ if (SrcSubLane < 0)
+ continue;
+ for (int j = 0; j != NumSubLaneElts; ++j)
+ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+ }
+
+ return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+ SubLaneMask);
+}
+
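// A minimal standalone sketch of the decomposition implemented above, assuming
// a v8f32 mask <5,4,7,6,1,0,3,2>: both 128-bit lanes use the same in-lane
// pattern <1,0,3,2>, so one repeating shuffle followed by a whole-lane permute
// reproduces the original mask.
#include <array>
#include <cassert>
int main() {
  std::array<int, 8> Want = {5, 4, 7, 6, 1, 0, 3, 2};
  std::array<int, 8> Repeated = {1, 0, 3, 2, 5, 4, 7, 6}; // in-lane only
  std::array<int, 8> LanePerm = {4, 5, 6, 7, 0, 1, 2, 3}; // then swap lanes
  for (int i = 0; i < 8; ++i)
    assert(Repeated[LanePerm[i]] == Want[i]); // the two stages compose
  return 0;
}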
+static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -10571,25 +11194,24 @@ static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
- return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
- DAG);
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG))
+ return V;
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
@@ -10597,7 +11219,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
- // Non-half-crossing single input shuffles can be lowerid with an
+ // Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
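// A worked example of the immediate above, assuming VPERMILPD's
// one-select-bit-per-double encoding: mask <1,0,3,2> yields
// 1 | (0 << 1) | (1 << 2) | (0 << 3) = 5, i.e. both 128-bit lanes swap their
// two doubles.
#include <cassert>
int main() {
  int Mask[4] = {1, 0, 3, 2};
  unsigned Imm = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  assert(Imm == 5);
  return 0;
}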
@@ -10606,10 +11228,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// With AVX2 we have direct support for this permutation.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG);
@@ -10629,19 +11257,25 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
- isShuffleMaskInputInPlace(1, Mask))))
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
  // If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
Mask, DAG);
@@ -10653,59 +11287,53 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
-static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
SmallVector<int, 4> WidenedMask;
if (canWidenShuffleElements(Mask, WidenedMask))
- return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
- DAG);
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
- // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
- // use lower latency instructions that will operate on both 128-bit lanes.
- SmallVector<int, 2> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
- if (isSingleInputShuffleMask(Mask)) {
- int PSHUFDMask[] = {-1, -1, -1, -1};
- for (int i = 0; i < 2; ++i)
- if (RepeatedMask[i] >= 0) {
- PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
- PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
- }
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use lower latency instructions that will operate on both lanes.
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
- }
- // AVX2 provides a direct instruction for permuting a single input across
- // lanes.
- if (isSingleInputShuffleMask(Mask))
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ }
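// A minimal standalone sketch of the scaling assumed above (scaleShuffleMask
// itself is not shown in this hunk): each v4i64 entry M widens to the two
// v8i32 entries <2M, 2M+1>, so the repeated mask <1,0> becomes the PSHUFD
// mask <2,3,0,1>.
#include <cassert>
#include <vector>
static std::vector<int> scaleBy2(const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  for (int M : Mask) {
    Scaled.push_back(M < 0 ? M : 2 * M);
    Scaled.push_back(M < 0 ? M : 2 * M + 1);
  }
  return Scaled;
}
int main() {
  assert((scaleBy2({1, 0}) == std::vector<int>{2, 3, 0, 1}));
  return 0;
}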
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -10717,7 +11345,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
@@ -10732,14 +11360,12 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10747,7 +11373,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10759,12 +11385,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
@@ -10774,30 +11400,30 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
- // have already handled any direct blends. We also need to squash the
- // repeated mask into a simulated v4f32 mask.
- for (int i = 0; i < 4; ++i)
- if (RepeatedMask[i] >= 8)
- RepeatedMask[i] -= 4;
+ // have already handled any direct blends.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
: DAG.getConstant(Mask[i], DL, MVT::i32);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
- return DAG.getNode(
- X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
- if (Subtarget->hasAVX2())
- return DAG.getNode(
- X86ISD::VPERMV, DL, MVT::v8f32,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ if (Subtarget.hasAVX2())
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10812,7 +11438,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG);
@@ -10824,16 +11450,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
-static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -10847,7 +11471,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10857,7 +11481,7 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
@@ -10868,24 +11492,30 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
+ // Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
- if (isSingleInputShuffleMask(Mask)) {
+ if (V2.isUndef()) {
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
: DAG.getConstant(Mask[i], DL, MVT::i32);
- return DAG.getNode(
- X86ISD::VPERMV, DL, MVT::v8i32,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
+ DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -10903,16 +11533,14 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
-static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -10922,7 +11550,7 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10936,8 +11564,8 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -10945,7 +11573,13 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (isSingleInputShuffleMask(Mask)) {
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (V2.isUndef()) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@@ -10960,26 +11594,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
-
- SDValue PSHUFBMask[32];
- for (int i = 0; i < 16; ++i) {
- if (Mask[i] == -1) {
- PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
- continue;
- }
-
- int M = i < 8 ? Mask[i] : Mask[i] - 8;
- assert(M >= 0 && M < 8 && "Invalid single-input mask!");
- PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
- PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
- }
- return DAG.getBitcast(MVT::v16i16,
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
- DAG.getBitcast(MVT::v32i8, V1),
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v32i8, PSHUFBMask)));
}
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
+
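// [editor example] lowerVectorShuffleWithPSHUFB subsumes the open-coded
// expansion deleted above: each in-lane word index M expands to the byte
// pair {2*M, 2*M+1}. A sketch of that expansion for one 128-bit half:
#include <vector>
static std::vector<int> wordMaskToPSHUFBBytes(const std::vector<int> &Words) {
  std::vector<int> Bytes;
  for (int M : Words) {
    if (M < 0) {            // undef word -> two undef bytes
      Bytes.push_back(-1);
      Bytes.push_back(-1);
    } else {
      Bytes.push_back(2 * M);     // low byte of source word M
      Bytes.push_back(2 * M + 1); // high byte of source word M
    }
  }
  return Bytes;
}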
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10994,16 +11614,14 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
-static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
- assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
@@ -11013,7 +11631,7 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
@@ -11027,8 +11645,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
// Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -11036,25 +11654,21 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (isSingleInputShuffleMask(Mask)) {
- // There are no generalized cross-lane shuffle operations available on i8
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
- SDValue PSHUFBMask[32];
- for (int i = 0; i < 32; ++i)
- PSHUFBMask[i] =
- Mask[i] < 0
- ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
- MVT::i8);
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
+ DAG);
- return DAG.getNode(
- X86ISD::PSHUFB, DL, MVT::v32i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
- }
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -11071,19 +11685,14 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
-
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
- int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
- return M >= NumElts;
- });
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
@@ -11101,11 +11710,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
- if (VT.isInteger() && !Subtarget->hasAVX2()) {
+ if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
- if (ElementBits < 32)
- // No floating point type available, decompose into 128-bit vectors.
+ if (ElementBits < 32) {
+ // No floating point type available; if we can't use the bit operations
+ // for masking/blending then decompose into 128-bit vectors.
+ if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ return V;
+ if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
@@ -11116,17 +11731,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -11134,21 +11749,37 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
-static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
- ArrayRef<int> Mask,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// Handling a 256-bit vector requires VLX, and most probably
// lowerV2X128VectorShuffle() is a better solution for that case.
- assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
+ assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+ // Ensure elements came from the same Op.
+ int MaxOp1Index = VT.getVectorNumElements() / 2 - 1;
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+ if (WidenedMask[i] == SM_SentinelUndef)
+ continue;
+
+ SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
+ unsigned OpIndex = (i < Size/2) ? 0 : 1;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+ }
+
// Form a 128-bit permutation.
// Convert the 64-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
@@ -11156,19 +11787,16 @@ static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
unsigned ControlBitsNum = WidenedMask.size() / 2;
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- if (WidenedMask[i] == SM_SentinelZero)
- return SDValue();
-
// Use first element in place of undef mask.
Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
}
- return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getConstant(PermMask, DL, MVT::i8));
}
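// [editor example] A worked instance of the packing above: for a v8i64
// shuffle whose widened 128-bit selection mask is {2,3,0,1} (swap the
// 256-bit halves), ControlBitsNum == 4 / 2 == 2 and the loop produces
//   PermMask = 2 | (3 << 2) | (0 << 4) | (1 << 6) == 0x4E,
// the immediate control byte handed to the VSHUF64x2-style SHUF128 node.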
-static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -11178,23 +11806,43 @@ static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
- if (isSingleInputShuffleMask(Mask))
+ if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (V2.isUndef()) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
+
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
+ ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
+ ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
+ DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ }
+
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+ }
+
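// [editor example] A worked instance of the VPERMILPMask construction just
// above: for the in-lane swap mask {1,0,3,2,5,4,7,6}, exactly the even
// comparisons hold (Mask[0]==1, Mask[2]==3, Mask[4]==5, Mask[6]==7), so
//   VPERMILPMask = 0b01010101 == 0x55.
// Bit i of the immediate selects the odd element of the 128-bit pair that
// result element i lives in.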
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Shuf128;
@@ -11203,42 +11851,90 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
+ // Check if the blend happens to exactly match the SHUFPD pattern.
+ if (SDValue Op =
+ lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Op;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
- return Unpck;
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
+ if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
+
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return Unpck;
+
+ // Otherwise, fall back to a SHUFPS sequence.
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
+ DAG);
+ }
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Shuf128;
+ if (V2.isUndef()) {
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+ // can use lower latency instructions that will operate on all four
+ // 128-bit lanes.
+ SmallVector<int, 2> Repeated128Mask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
+ SmallVector<int, 4> PSHUFDMask;
+ scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
+ return DAG.getBitcast(
+ MVT::v8i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+ DAG.getBitcast(MVT::v16i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+ }
+
+ SmallVector<int, 4> Repeated256Mask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
+ getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+ }
+
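// [editor example] A sketch of the mask scaling used above, assuming
// scaleShuffleMask(2, M, Out) replaces each index with two indices at twice
// the granularity; e.g. the repeated v8i64 lane mask {1,0} becomes the
// v16i32 PSHUFD mask {2,3,0,1}, applied through bitcasts.
#include <vector>
static std::vector<int> scaleMaskBy2(const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  for (int M : Mask) {
    Scaled.push_back(M < 0 ? M : 2 * M);     // low dword of the qword
    Scaled.push_back(M < 0 ? M : 2 * M + 1); // high dword of the qword
  }
  return Scaled;
}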
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
@@ -11247,49 +11943,111 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
- return Unpck;
+ // If the shuffle mask is repeated in each 128-bit lane, we can use more
+ // efficient instructions that mirror the shuffles across the four 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return V;
+ }
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
-static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
- assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+ assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (V2.isUndef()) {
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v32 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+ }
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
-static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
- assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+ assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
+ V2, Subtarget, DAG))
+ return PSHUFB;
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
@@ -11300,61 +12058,50 @@ static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
- // Dispatch to each element type for lowering. If we don't have supprot for
+ // Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- if (Subtarget->hasBWI())
- return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- if (Subtarget->hasBWI())
- return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
-
- // Otherwise fall back on splitting.
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
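// [editor sketch] In DAG terms the strategy reads roughly as follows, with
// ExtVT being the legal integer vector chosen from VT below (illustrative,
// not the exact code of this routine):
//   SDValue Ext1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
//   SDValue Ext2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
//   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, Ext1, Ext2, Mask);
//   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);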
-static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- MVT VT, const X86Subtarget *Subtarget,
+static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
switch (VT.SimpleTy) {
@@ -11405,7 +12152,7 @@ static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
@@ -11413,14 +12160,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
- SDLoc dl(Op);
+ SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
- bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsUndef = V1.isUndef();
+ bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
@@ -11440,7 +12187,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
- return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// We actually see shuffles that are entirely re-arrangements of a set of
@@ -11448,7 +12195,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// simple ones. Directly lower these as a buildvector of zeros.
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
if (Zeroable.all())
- return getZeroVector(VT, Subtarget, DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
@@ -11467,12 +12214,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
- VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
- for (int M : SVOp->getMask())
+ for (int M : Mask)
if (M < 0)
++NumUndefElements;
else if (M < NumElements)
@@ -11486,6 +12233,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
if (NumV2Elements > NumV1Elements)
return DAG.getCommutedVectorShuffle(*SVOp);
+ assert(NumV1Elements > 0 && "No V1 indices");
+ assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
+
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
@@ -11493,28 +12243,28 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
- for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
- if (LowV2Elements > LowV1Elements) {
+ if (LowV2Elements > LowV1Elements)
return DAG.getCommutedVectorShuffle(*SVOp);
- } else if (LowV2Elements == LowV1Elements) {
+ if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
- for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
- if (SVOp->getMask()[i] >= NumElements)
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
SumV2Indices += i;
- else if (SVOp->getMask()[i] >= 0)
+ else if (Mask[i] >= 0)
SumV1Indices += i;
- if (SumV2Indices < SumV1Indices) {
+ if (SumV2Indices < SumV1Indices)
return DAG.getCommutedVectorShuffle(*SVOp);
- } else if (SumV2Indices == SumV1Indices) {
+ if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
- for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
- if (SVOp->getMask()[i] >= NumElements)
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
- else if (SVOp->getMask()[i] >= 0)
+ else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return DAG.getCommutedVectorShuffle(*SVOp);
@@ -11524,69 +12274,23 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- llvm_unreachable("Unimplemented!");
-}
+ return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
-// This function assumes its argument is a BUILD_VECTOR of constants or
-// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
-// true.
-static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
- unsigned &MaskValue) {
- MaskValue = 0;
- unsigned NumElems = BuildVector->getNumOperands();
-
- // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- // We don't handle the >2 lanes case right now.
- unsigned NumLanes = (NumElems - 1) / 8 + 1;
- if (NumLanes > 2)
- return false;
-
- unsigned NumElemsInLane = NumElems / NumLanes;
-
- // Blend for v16i16 should be symmetric for the both lanes.
- for (unsigned i = 0; i < NumElemsInLane; ++i) {
- SDValue EltCond = BuildVector->getOperand(i);
- SDValue SndLaneEltCond =
- (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
-
- int Lane1Cond = -1, Lane2Cond = -1;
- if (isa<ConstantSDNode>(EltCond))
- Lane1Cond = !isNullConstant(EltCond);
- if (isa<ConstantSDNode>(SndLaneEltCond))
- Lane2Cond = !isNullConstant(SndLaneEltCond);
-
- unsigned LaneMask = 0;
- if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
- // Lane1Cond != 0, means we want the first argument.
- // Lane1Cond == 0, means we want the second argument.
- // The encoding of this argument is 0 for the first argument, 1
- // for the second. Therefore, invert the condition.
- LaneMask = !Lane1Cond << i;
- else if (Lane1Cond < 0)
- LaneMask = !Lane2Cond << i;
- else
- return false;
-
- MaskValue |= LaneMask;
- if (NumLanes == 2)
- MaskValue |= LaneMask << NumElemsInLane;
- }
- return true;
+ llvm_unreachable("Unimplemented!");
}
/// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
@@ -11624,7 +12328,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return BlendOp;
// Variable blends are only legal from SSE4.1 onward.
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
// Only some types will be legal on some subtargets. If we can emit a legal
@@ -11637,7 +12341,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
- if (Subtarget->hasAVX2())
+ if (Subtarget.hasAVX2())
return Op;
return SDValue();
@@ -11645,7 +12349,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
case MVT::v8i16:
case MVT::v16i16:
// AVX-512 BWI and VLX features support VSELECT with i16 elements.
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
return Op;
// FIXME: We should custom lower this by fixing the condition and using i8
@@ -11723,7 +12427,7 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
- assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
@@ -11737,10 +12441,15 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- const TargetRegisterClass* rc = getRegClassFor(VecVT);
- if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
- rc = getRegClassFor(MVT::v16i1);
- unsigned MaxSift = rc->getSize()*8 - 1;
+ if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
+ // Use kshiftlw/rw instruction.
+ VecVT = MVT::v16i1;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, DAG.getUNDEF(VecVT),
+ Vec, DAG.getIntPtrConstant(0, dl));
+ }
+ unsigned MaxSift = VecVT.getVectorNumElements() - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
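// [editor example] Worked numbers for the shift pair above: with the mask
// widened to v16i1, MaxSift == 15. To extract bit IdxVal == 3, VSHLI by
// 15 - 3 == 12 discards bits 4..15, and VSRLI by 15 brings the chosen bit
// down to position 0 with every other bit cleared.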
@@ -11762,7 +12471,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (!isa<ConstantSDNode>(Idx)) {
if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+ (VecVT.is256BitVector() && Subtarget.hasInt256() &&
VecVT.getVectorElementType().getSizeInBits() == 32)) {
MVT MaskEltVT =
@@ -11782,13 +12491,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
@@ -11803,38 +12512,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(VecVT.is128BitVector() && "Unexpected vector length");
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
MVT VT = Op.getSimpleValueType();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
- SDValue Vec = Op.getOperand(0);
- if (isNullConstant(Op.getOperand(1)))
+ if (IdxVal == 0)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Vec),
- Op.getOperand(1)));
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
// Transform it so it matches pextrw, which produces a 32-bit result.
MVT EltVT = MVT::i32;
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
- Op.getOperand(0), Op.getOperand(1));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
if (VT.getSizeInBits() == 32) {
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- if (Idx == 0)
+ if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
- MVT VVT = Op.getOperand(0).getSimpleValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
- DAG.getUNDEF(VVT), Mask);
+ int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
@@ -11843,16 +12547,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
- if (isNullConstant(Op.getOperand(1)))
+ if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- MVT VVT = Op.getOperand(0).getSimpleValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
- DAG.getUNDEF(VVT), Mask);
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
@@ -11886,7 +12588,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if (IdxVal)
EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
- if (Vec.getOpcode() == ISD::UNDEF)
+ if (Vec.isUndef())
return EltInVec;
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
@@ -11895,6 +12597,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG);
@@ -11908,6 +12611,19 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
+ // If we are clearing out an element, we can do this more efficiently with a
+ // blend shuffle than a costly integer insertion.
+ // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+ // TODO: pre-SSE41 targets will tend to use bit masking - this could still
+ // be beneficial if we are inserting several zeros and can combine the masks.
+ if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
+ SmallVector<int, 8> ClearMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ClearMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ }
+
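// [editor example] Worked instance of the blend mask above: for v4i32 and
// IdxVal == 2, ClearMask == {0, 1, 6, 3}; indices >= NumElts (here 4)
// select from ZeroVector, so only element 2 is replaced with zero and the
// whole insertion becomes a single blend-style shuffle.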
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -11917,8 +12633,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
- if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
- (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
N2 = DAG.getIntPtrConstant(1, dl);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
@@ -11926,7 +12642,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
// Get the desired 128-bit vector chunk.
- SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+ SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
@@ -11938,11 +12654,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
DAG.getConstant(IdxIn128, dl, MVT::i32));
// Insert the changed part back into the bigger vector
- return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+ return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
unsigned Opc;
if (VT == MVT::v8i16) {
@@ -12026,7 +12742,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
- return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+ return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
if (OpVT == MVT::v1i64 &&
@@ -12042,7 +12758,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
-static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
@@ -12051,15 +12767,15 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
MVT ResVT = Op.getSimpleValueType();
MVT InVT = In.getSimpleValueType();
- if (Subtarget->hasFp256()) {
+ if (Subtarget.hasFp256()) {
if (ResVT.is128BitVector() &&
(InVT.is256BitVector() || InVT.is512BitVector()) &&
isa<ConstantSDNode>(Idx)) {
- return Extract128BitVector(In, IdxVal, DAG, dl);
+ return extract128BitVector(In, IdxVal, DAG, dl);
}
if (ResVT.is256BitVector() && InVT.is512BitVector() &&
isa<ConstantSDNode>(Idx)) {
- return Extract256BitVector(In, IdxVal, DAG, dl);
+ return extract256BitVector(In, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -12068,9 +12784,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
-static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasAVX())
+ if (!Subtarget.hasAVX())
return SDValue();
SDLoc dl(Op);
@@ -12094,16 +12810,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through a bitcast to get to the load.
- if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
- SubVec2 = SubVec2.getOperand(0);
-
+ // If needed, look through bitcasts to get to the load.
+ SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
bool Fast;
unsigned Alignment = FirstLd->getAlignment();
unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget->getTargetLowering();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {
SDValue Ops[] = { SubVec2, SubVec };
@@ -12116,13 +12829,13 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
SubVecVT.is128BitVector())
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
if (OpVT.getVectorElementType() == MVT::i1)
- return Insert1BitVector(Op, DAG);
+ return insert1BitVector(Op, DAG, Subtarget);
return SDValue();
}
@@ -12139,17 +12852,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
@@ -12171,17 +12880,13 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
@@ -12203,22 +12908,14 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- unsigned char OpFlag = 0;
+ const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
unsigned WrapperKind = X86ISD::Wrapper;
CodeModel::Model M = DAG.getTarget().getCodeModel();
- if (Subtarget->isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel)) {
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
- OpFlag = X86II::MO_GOTPCREL;
+ if (Subtarget.isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
WrapperKind = X86ISD::WrapperRIP;
- } else if (Subtarget->isPICStyleGOT()) {
- OpFlag = X86II::MO_GOT;
- } else if (Subtarget->isPICStyleStubPIC()) {
- OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
- } else if (Subtarget->isPICStyleStubNoDynamic()) {
- OpFlag = X86II::MO_DARWIN_NONLAZY;
- }
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
@@ -12227,8 +12924,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
- if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
- !Subtarget->is64Bit()) {
+ if (isPositionIndependent() && !Subtarget.is64Bit()) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
@@ -12238,8 +12934,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
@@ -12248,7 +12943,7 @@ SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
- Subtarget->ClassifyBlockAddressReference();
+ Subtarget.classifyBlockAddressReference();
CodeModel::Model M = DAG.getTarget().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
@@ -12256,7 +12951,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
@@ -12271,13 +12966,12 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
- int64_t Offset, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+ const SDLoc &dl, int64_t Offset,
+ SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
- unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+ unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
@@ -12290,7 +12984,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
- if (Subtarget->isPICStyleRIPRel() &&
+ if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
@@ -12306,8 +13000,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
@@ -12429,7 +13122,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
- MachinePointerInfo(Ptr), false, false, false, 0);
+ MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
@@ -12464,8 +13157,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
@@ -12478,45 +13170,40 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- // Cygwin uses emutls.
- // FIXME: It may be EmulatedTLS-generic also for X86-Android.
- if (Subtarget->isTargetWindowsCygwin())
+ if (DAG.getTarget().Options.EmulatedTLS)
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool PositionIndependent = isPositionIndependent();
- if (Subtarget->isTargetELF()) {
- if (DAG.getTarget().Options.EmulatedTLS)
- return LowerToTLSEmulatedModel(GA, DAG);
+ if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
- Subtarget->is64Bit());
+ Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
- DAG.getTarget().getRelocationModel() ==
- Reloc::PIC_);
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+ PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
- unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+ unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
- !Subtarget->is64Bit();
+ bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
@@ -12540,9 +13227,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
- Chain =
- DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
- DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true),
+ Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -12550,12 +13237,13 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
- unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
- if (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsGNU()) {
+ if (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium() ||
+ Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -12573,21 +13261,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
- Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+ Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
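// (Address spaces 256 and 257 are how X86 codegen spells %gs- and
// %fs-relative addressing, so the null "pointer" above effectively names
// gs:0 / fs:0 for the MachinePointerInfo of the load below.)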
- SDValue TlsArray = Subtarget->is64Bit()
+ SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
- : (Subtarget->isTargetWindowsGNU()
+ : (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
- DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
- false, false, 0);
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
@@ -12595,13 +13282,11 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
- if (Subtarget->is64Bit())
+ if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
- MachinePointerInfo(), MVT::i32, false, false,
- false, 0);
+ MachinePointerInfo(), MVT::i32);
else
- IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
- false, false, 0);
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
@@ -12611,8 +13296,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
- res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
- false, 0);
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -12628,7 +13312,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("TLS not implemented for this target.");
}
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
@@ -12711,13 +13395,13 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
return Op;
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- Subtarget->is64Bit()) {
+ Subtarget.is64Bit()) {
return Op;
}
SDValue ValueToStore = Op.getOperand(0);
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
- !Subtarget->is64Bit())
+ !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
@@ -12730,8 +13414,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(
DAG.getEntryNode(), dl, ValueToStore, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
- false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
@@ -12789,14 +13472,13 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
}
return Result;
}
-// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SelectionDAG &DAG) const {
// This algorithm is not obvious. Here is what we're trying to output:
@@ -12837,20 +13519,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget->hasSSE3()) {
+ if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
@@ -12865,7 +13547,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
DAG.getIntPtrConstant(0, dl));
}
-// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -12945,10 +13627,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
- unsigned NumElts = VecIntVT.getVectorNumElements();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
- assert(NumElts <= 8 && "The size of the constant array must be fixed");
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
@@ -12958,24 +13638,12 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// -- v >> 16
// Create the splat vector for 0x4b000000.
- SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
- SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
- CstLow, CstLow, CstLow, CstLow};
- SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstLowArray[0], NumElts));
+ SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
- SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
- SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
- CstHigh, CstHigh, CstHigh, CstHigh};
- SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstHighArray[0], NumElts));
+ SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
- SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
- SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
- CstShift, CstShift, CstShift, CstShift};
- SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
- makeArrayRef(&CstShiftArray[0], NumElts));
+ SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
@@ -12997,9 +13665,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
} else {
- SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
- SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
- CstMask, CstMask, CstMask);
+ SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
@@ -13009,12 +13675,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
}
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
- SDValue CstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
- SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
- CstFAdd, CstFAdd, CstFAdd, CstFAdd};
- SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
- makeArrayRef(&CstFAddArray[0], NumElts));
+ SDValue VecCstFAdd = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
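// Why this works, roughly: 0x4b000000 is 2^23 as an IEEE-754 float and
// 0x53000000 is 2^39, so OR-ing the halfwords into those mantissas yields
// exactly lo_f = 2^23 + lo and hi_f = 2^39 + hi * 2^16. Subtracting
// (2^39 + 2^23) from hi_f leaves hi * 2^16 - 2^23, and the final add
// reassembles hi * 2^16 + lo == v with a single rounding step.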
@@ -13045,10 +13707,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
}
case MVT::v4i32:
case MVT::v8i32:
- return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+ return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v16i8:
case MVT::v16i16:
- assert(Subtarget->hasAVX512());
+ assert(Subtarget.hasAVX512());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
@@ -13072,8 +13734,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
- if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
- (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
+ if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+ (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
@@ -13083,34 +13745,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
- SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
+ SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo(),
- false, false, 0);
+ StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MachinePointerInfo(),
- false, false, 0);
+ OffsetSlot, MachinePointerInfo());
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
return Fild;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Op.getOperand(0);
- if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit())
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
- StackSlot, MachinePointerInfo(),
- false, false, 0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+ MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -13149,7 +13807,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- false, false, false, 4);
+ /* Alignment = */ 4);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
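// The fudge factor loaded above is either 0.0 or 2^64, selected by the sign
// bit of the input: FILD interpreted the i64 as signed, and adding 2^64 in
// f80 (whose 64-bit significand makes the sum exact) recovers the intended
// unsigned value before the final rounding.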
@@ -13186,10 +13844,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned &&
DstTy == MVT::i64 &&
- (!Subtarget->is64Bit() ||
+ (!Subtarget.is64Bit() ||
!isScalarFPTypeInSSEReg(TheVT));
- if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
+ if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -13204,7 +13862,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (DstTy == MVT::i32 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
- if (Subtarget->is64Bit() &&
+ if (Subtarget.is64Bit() &&
DstTy == MVT::i64 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
@@ -13280,8 +13938,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(MF, SSFI), false,
- false, 0);
+ MachinePointerInfo::getFixedStack(MF, SSFI));
SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
SDValue Ops[] = {
Chain, StackSlot, DAG.getValueType(TheVT)
@@ -13309,18 +13966,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
FistOps, DstTy, MMO);
- SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
- MachinePointerInfo(),
- false, false, false, 0);
- SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
- DAG.getConstant(4, DL, PtrVT));
+ SDValue Low32 =
+ DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
+ SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
- SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDValue High32 =
+ DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
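// Adjust is either 0 or 0x80000000: inputs >= 2^63 had 2^63 subtracted
// before the FIST above so the signed store would not overflow, and XOR-ing
// the high word here flips that bit back into the unsigned result.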
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// Join High32 and Low32 into a 64-bit result.
// (High32 << 32) | Low32
Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
@@ -13347,7 +14001,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
@@ -13374,7 +14028,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
return SDValue();
- if (Subtarget->hasInt256())
+ if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
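// Without AVX2 the extend is emulated: interleaving the source with this
// zero vector zero-extends each element, and the unpacked low and high
// halves are then concatenated into the wide result.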
@@ -13393,41 +14047,46 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
assert(InVT.getVectorElementType() == MVT::i1);
- MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+
+  // Extend VT if it is a 128- or 256-bit vector and VLX is not supported.
+ MVT ExtVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
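+  // Materialize the i1 mask as select(mask, splat(1), splat(0)) in the
+  // possibly-widened integer type, then truncate back if we had to widen.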
SDValue One =
DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
- if (VT.is512BitVector())
- return V;
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
+ SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ if (VT == ExtVT)
+ return SelectedVal;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
-static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256())
+ if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
return SDValue();
}
-static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
@@ -13437,7 +14096,7 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
- if (Subtarget->hasFp256())
+ if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
@@ -13447,50 +14106,32 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
- assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type.");
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
- // Shift LSB to MSB and use VPMOVB2M - SKX.
+ // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
- if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M
- ((InVT.is256BitVector() || InVT.is128BitVector()) &&
- InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
- Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M
- // Shift packed bytes not supported natively, bitcast to dword
- MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
- DAG.getBitcast(ExtVT, In),
- DAG.getConstant(ShiftInx, DL, ExtVT));
- ShiftNode = DAG.getBitcast(InVT, ShiftNode);
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
- }
- if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M
- ((InVT.is256BitVector() || InVT.is128BitVector()) &&
- InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
- Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M
-
- SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
- DAG.getConstant(ShiftInx, DL, InVT));
- return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
- }
-
- // Shift LSB to MSB, extend if necessary and use TESTM.
- unsigned NumElts = InVT.getVectorNumElements();
- if (InVT.getSizeInBits() < 512 &&
- (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
- !Subtarget->hasVLX())) {
- assert((NumElts == 8 || NumElts == 16) && "Unexected vector type.");
-
- // TESTD/Q should be used (if BW supported we use CVT2MASK above),
- // so vector should be extended to packed dword/qword.
+ if (InVT.getScalarSizeInBits() <= 16) {
+ if (Subtarget.hasBWI()) {
+      // Legal; this will be selected to VPMOVB2M/VPMOVW2M.
+      // Shifts of packed bytes are not supported natively, so bitcast to words.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+    // Use TESTD/Q after extending the vector to packed dwords/qwords.
+ assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ "Unexpected vector type.");
+ unsigned NumElts = InVT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
@@ -13523,16 +14164,16 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
// word to byte only under BWI
- if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
+ if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
- if (Subtarget->hasInt256()) {
+ if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
@@ -13553,7 +14194,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
- if (Subtarget->hasInt256()) {
+ if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
SmallVector<SDValue,32> pshufbMask;
@@ -13569,13 +14210,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned j = 0; j < 8; ++j)
pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
+ SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- &ShufMask[0]);
+ ShufMask);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -13611,7 +14252,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (!VT.is128BitVector() || !InVT.is256BitVector())
return SDValue();
- assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+ assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
@@ -13621,7 +14262,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), &MaskVec[0]);
+ DAG.getUNDEF(NVT), MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
@@ -13639,9 +14280,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
+ MachinePointerInfo());
// The node is the result.
return FIST;
@@ -13658,9 +14298,8 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
+ MachinePointerInfo());
// The node is the result.
return FIST;
@@ -13736,10 +14375,9 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask =
- DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, Alignment);
+ SDValue Mask = DAG.getLoad(
+ LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
@@ -13807,7 +14445,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mask1 =
DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
if (!IsF128)
Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
@@ -13833,7 +14471,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Val =
DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- false, false, false, 16);
+ /* Alignment = */ 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (!isa<ConstantFPSDNode>(Op0)) {
if (!IsF128)
@@ -13852,18 +14490,25 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
- SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
- DAG.getConstant(1, dl, VT));
- return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
+ MVT OpVT = N0.getSimpleValueType();
+ assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
+ "Unexpected type for FGETSIGN");
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
+ MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
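+  // MOVMSK packs the sign bit of every lane into the low bits of a GPR; the
+  // scalar now sits in lane 0, so bit 0 of the mask is its sign bit, and the
+  // AND below isolates it.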
+ Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
+ Res = DAG.getZExtOrTrunc(Res, dl, VT);
+ Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
+ return Res;
}
// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
- if (!Subtarget->hasSSE41())
+ if (!Subtarget.hasSSE41())
return SDValue();
if (!Op->hasOneUse())
@@ -13969,9 +14614,27 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
+// Emit KTEST instruction for bit vectors on AVX-512
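+// KTEST k, k ANDs a mask register with itself and sets ZF exactly when the
+// mask is all zeros, so an all-zeros test on an i1 vector can stay in a mask
+// register instead of being moved to a GPR for a TEST.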
+static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Op.getOpcode() == ISD::BITCAST) {
+ auto hasKTEST = [&](MVT VT) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
+ (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
+ };
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType().getSimpleVT();
+ if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
+ hasKTEST(Op0VT))
+ return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
+ }
+ return SDValue();
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1) {
SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
@@ -14014,10 +14677,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
// Emit a CMP with 0, which is the TEST pattern.
- //if (Op.getValueType() == MVT::i1)
- // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
- // DAG.getConstant(0, MVT::i1));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
@@ -14071,14 +14734,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->isOne() && !Subtarget->slowIncDec()) {
+ if (C->isOne() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
+ if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
@@ -14106,18 +14769,26 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
if (!Mask.isSignedIntN(32)) // Avoid large immediates.
break;
- SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
- DAG.getConstant(Mask, dl, VT));
- DAG.ReplaceAllUsesWith(Op, New);
- Op = New;
+ Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, dl, VT));
}
break;
case ISD::AND:
- // If the primary and result isn't used, don't bother using X86ISD::AND,
+ // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
- if (!hasNonFlagsUse(Op))
- break;
+ if (!hasNonFlagsUse(Op)) {
+ SDValue Op0 = ArithOp->getOperand(0);
+ SDValue Op1 = ArithOp->getOperand(1);
+ EVT VT = ArithOp.getValueType();
+ bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
+ bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+
+ // But if we can combine this into an ANDN operation, then create an AND
+ // now and allow it to be pattern matched into an ANDN.
+ if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
+ break;
+ }
// FALL THROUGH
case ISD::SUB:
case ISD::OR:
@@ -14137,8 +14808,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
- SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
- if (EFLAGS.getNode())
+ if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
Opcode = X86ISD::OR;
@@ -14190,11 +14860,15 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
}
}
- if (Opcode == 0)
+ if (Opcode == 0) {
+ // Emit KTEST for bit vectors
+ if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+ return Node;
+
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
-
+ }
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
@@ -14206,7 +14880,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG);
@@ -14215,13 +14889,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Do the comparison at i32 if it's smaller, besides the Atom case.
- // This avoids subregister aliasing issues. Keep the smaller reference
- // if we're optimizing for size, however, as that'll allow better folding
- // of memory operations.
- if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+    // Only promote the compare up to i32 if it is a 16-bit operation with
+    // an immediate; 16-bit immediates are to be avoided.
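+    // (A 16-bit immediate forces the 0x66 operand-size prefix, and such
+    // length-changing prefixes are a known decoder stall on many Intel cores.)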
+ if ((Op0.getValueType() == MVT::i16 &&
+ (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
!DAG.getMachineFunction().getFunction()->optForMinSize() &&
- !Subtarget->isAtom()) {
+ !Subtarget.isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
@@ -14241,7 +14914,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
- if (Subtarget->hasCMov() ||
+ if (Subtarget.hasCMov() ||
Cmp.getOpcode() != X86ISD::CMP ||
!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
!Cmp.getOperand(1).getValueType().isFloatingPoint())
@@ -14259,7 +14932,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
+ assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
@@ -14279,10 +14952,10 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
+ if (VT == MVT::f32 && Subtarget.hasSSE1())
RecipOp = "sqrtf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()))
RecipOp = "vec-sqrtf";
else
return SDValue();
@@ -14311,10 +14984,10 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget->hasSSE1())
+ if (VT == MVT::f32 && Subtarget.hasSSE1())
RecipOp = "divf";
- else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX()))
RecipOp = "vec-divf";
else
return SDValue();
@@ -14337,10 +15010,9 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
-/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
-/// if it's possible.
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
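+/// For example, (and X, (shl 1, N)) compared against zero can become
+/// (BT X, N), with the answer read out of CF; the checks below guard the
+/// cases where that rewrite is sound.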
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
- SDLoc dl, SelectionDAG &DAG) const {
+ const SDLoc &dl, SelectionDAG &DAG) const {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -14353,19 +15025,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
- // If we looked past a truncate, check that it's only truncating away
- // known zeros.
- unsigned BitWidth = Op0.getValueSizeInBits();
- unsigned AndBitWidth = And.getValueSizeInBits();
- if (BitWidth > AndBitWidth) {
- APInt Zeros, Ones;
- DAG.computeKnownBits(Op0, Zeros, Ones);
- if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
- return SDValue();
- }
- LHS = Op1;
- RHS = Op0.getOperand(1);
+ // If we looked past a truncate, check that it's only truncating away
+ // known zeros.
+ unsigned BitWidth = Op0.getValueSizeInBits();
+ unsigned AndBitWidth = And.getValueSizeInBits();
+ if (BitWidth > AndBitWidth) {
+ APInt Zeros, Ones;
+ DAG.computeKnownBits(Op0, Zeros, Ones);
+ if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ return SDValue();
}
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
@@ -14407,8 +15079,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
-/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
-/// mask CMPs.
+/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
+/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1) {
unsigned SSECC;
@@ -14452,8 +15124,8 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
return SSECC;
}
-// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
-// ones, and then concatenate the result back.
+/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
+/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -14466,13 +15138,13 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+ SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+ SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType();
@@ -14525,16 +15197,15 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
}
}
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
- Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
@@ -14568,8 +15239,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
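/// The rewrite relies on (x u< C) == (x u<= C - 1), which only holds when no
/// element of C is zero, so the transform gives up if it sees one.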
-static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
-{
+static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
+ SelectionDAG &DAG) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
if (!BV)
return SDValue();
@@ -14592,10 +15263,10 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
+ return DAG.getBuildVector(VT, dl, ULTOp1);
}
-static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -14611,32 +15282,59 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
- unsigned Opc = X86ISD::CMPP;
- if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ unsigned Opc;
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
- }
- // In the two special cases we can't handle, emit two comparisons.
+ } else {
+ Opc = X86ISD::CMPP;
+ // The SSE/AVX packed FP comparison nodes are defined with a
+ // floating-point vector result that matches the operand type. This allows
+ // them to work with an SSE1 target (integer vector types are not legal).
+ VT = Op0.getSimpleValueType();
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+ // available.
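+  // In SSE predicate terms, SETUEQ == UNORD | EQ and SETONE == ORD & NEQ,
+  // which is how CC0, CC1 and CombineOpc are chosen below.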
+ SDValue Cmp;
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
if (SSECC == 8) {
+ // LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (SetCCOpcode == ISD::SETUEQ) {
- CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
+ static_cast<unsigned>(ISD::OR);
} else {
assert(SetCCOpcode == ISD::SETONE);
- CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
+ static_cast<unsigned>(ISD::AND);
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
- return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+ } else {
+ // Handle all other FP comparisons here.
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, dl, MVT::i8));
}
- // Handle all other FP comparisons here.
- return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+ // result type of SETCC. The bitcast is expected to be optimized away
+ // during combining/isel.
+ if (Opc == X86ISD::CMPP)
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ return Cmp;
}
MVT VTOp0 = Op0.getSimpleValueType();
@@ -14665,38 +15363,38 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
- assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
+ assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// Break 256-bit integer vector compare into smaller ones.
- if (VT.is256BitVector() && !Subtarget->hasInt256())
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
+ // Operands are boolean (vectors of i1)
MVT OpVT = Op1.getSimpleValueType();
if (OpVT.getVectorElementType() == MVT::i1)
return LowerBoolVSETCC_AVX512(Op, DAG);
- bool MaskResult = (VT.getVectorElementType() == MVT::i1);
- if (Subtarget->hasAVX512()) {
- if (Op1.getSimpleValueType().is512BitVector() ||
- (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
- (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
- return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
-
+ // The result is boolean, but operands are int/float
+ if (VT.getVectorElementType() == MVT::i1) {
+    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
// but there is no compare instruction for i8 and i16 elements in KNL.
- // We are not talking about 512-bit operands in this case, these
- // types are illegal.
- if (MaskResult &&
- (OpVT.getVectorElementType().getSizeInBits() < 32 &&
- OpVT.getVectorElementType().getSizeInBits() >= 8))
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+    // In this case, use a normal vector compare and truncate the result.
+ bool UseAVX512Inst =
+ (OpVT.is512BitVector() ||
+ OpVT.getVectorElementType().getSizeInBits() >= 32 ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX()));
+
+ if (UseAVX512Inst)
+ return LowerIntVSETCC_AVX512(Op, DAG);
+
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
}
// Lower using XOP integer comparisons.
if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (SetCCOpcode) {
@@ -14748,8 +15446,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
bool hasMinMax =
- (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
- || (Subtarget->hasSSE2() && (VET == MVT::i8));
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget.hasSSE2() && (VET == MVT::i8));
if (hasMinMax) {
switch (SetCCOpcode) {
@@ -14761,7 +15459,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
}
- bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
if (!MinMax && hasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
@@ -14775,10 +15473,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// beneficial because the constant in the register is no longer
// clobbered as the destination, so it can be hoisted out of a loop.
// Only do this pre-AVX since vpcmp* is no longer destructive.
- if (Subtarget->hasAVX())
+ if (Subtarget.hasAVX())
break;
- SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
- if (ULEOp1.getNode()) {
+ if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
Op1 = ULEOp1;
Subus = true; Invert = false; Swap = false;
}
@@ -14801,8 +15498,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
- if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
- assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+ if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
+ assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
@@ -14817,8 +15514,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
} else {
SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
- SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
- Sign, Zero, Sign, Zero);
+ SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
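// Flipping the sign bit of the low dwords makes the signed PCMPGTD below
// act as an unsigned compare on them; the halves are then combined as
// roughly (hi1 > hi2) | ((hi1 == hi2) & (lo1 >u lo2)) to form the v2i64
// result.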
@@ -14843,10 +15539,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getBitcast(VT, Result);
}
- if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
- assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+ assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
@@ -14899,7 +15595,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
&& "SetCC type must be 8-bit or 1-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -14914,8 +15610,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
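+        // The AssertZext records that only bit 0 of the i8 SETCC result can
+        // be set, so later lowering can peel the truncate back off and reuse
+        // the flags (see getCondAfterTruncWithZeroHighBitsInput below).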
+ NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+ }
return NewSetCC;
}
}
@@ -14937,16 +15636,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(CCode, dl, MVT::i8),
Op0.getOperand(1));
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ }
return SetCC;
}
}
- if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-
- ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
- return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isOneConstant(Op1)) {
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+ }
+ if (!isNullConstant(Op1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
+ return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
+ }
}
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
@@ -14958,8 +15664,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
- if (VT == MVT::i1)
+ if (VT == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ }
return SetCC;
}
@@ -14978,12 +15687,15 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
- if (Op.getSimpleValueType() == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ if (Op.getSimpleValueType() == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return SetCC;
}
-// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
@@ -15009,14 +15721,23 @@ static bool isX86LogicalCmp(SDValue Op) {
return false;
}
-static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+/// Returns the "condition" node, that may be wrapped with "truncate".
+/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V,
                                                      SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
- return false;
+ return V;
SDValue VOp0 = V.getOperand(0);
+ if (VOp0.getOpcode() == ISD::AssertZext &&
+ V.getValueSizeInBits() ==
+ cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
+ return VOp0.getOperand(0);
+
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
- return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+ if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
+ return V.getOperand(0);
+ return V;
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -15032,15 +15753,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
- (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+ ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
int SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (SSECC != 8) {
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
@@ -15062,7 +15783,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
- if (Subtarget->hasAVX() &&
+ if (Subtarget.hasAVX() &&
!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
@@ -15122,8 +15843,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (Cond.getOpcode() == ISD::SETCC) {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
@@ -15240,8 +15960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
// Look past the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -15302,7 +16021,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
@@ -15313,22 +16032,22 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+ (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget->hasBWI() && VT.is512BitVector() &&
+ ((Subtarget.hasBWI() && VT.is512BitVector() &&
VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+ ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
- ((Subtarget->hasDQI() && VT.is512BitVector() &&
+ ((Subtarget.hasDQI() && VT.is512BitVector() &&
VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
+ if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
@@ -15352,25 +16071,35 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
}
static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getSizeInBits() == InVT.getSizeInBits());
+ MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
- assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());
+ assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
- if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
+ if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
+ !(VT.is256BitVector() && Subtarget.hasInt256()))
+ return SDValue();
SDLoc dl(Op);
+ // For 256-bit vectors, we only need the lower (128-bit) half of the input.
+ if (VT.is256BitVector())
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+ MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
+ In, DAG.getIntPtrConstant(0, dl));
+
// SSE41 targets can use the pmovsx* instructions directly.
- if (Subtarget->hasSSE41())
+ if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
@@ -15407,7 +16136,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
return SDValue();
}
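
The unpack+SRAI trick used on the pre-SSE41 path can be modeled on scalars; a minimal sketch, assuming an i8-to-i16 extension (the widths are illustrative, this is not the actual DAG code):

#include <cstdint>

static int16_t signExtendViaShift(uint8_t Lane) {
  // "Unpack": place the byte in the high half of the 16-bit lane.
  uint16_t High = static_cast<uint16_t>(Lane) << 8;
  // SRAI: the arithmetic shift copies the sign bit across the low half.
  return static_cast<int16_t>(High) >> 8;
}
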
-static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
@@ -15422,7 +16151,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
(VT != MVT::v16i16 || InVT != MVT::v16i8))
return SDValue();
- if (Subtarget->hasInt256())
+ if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// Optimize vectors in AVX mode
@@ -15441,13 +16170,13 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask1[i] = i;
- SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+ SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
SmallVector<int,8> ShufMask2(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask2[i] = i + NumElems/2;
- SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements()/2);
@@ -15458,6 +16187,157 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+// Lower a truncating store. We need special lowering for truncating stores
+// to vXi1 vectors.
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+ SDLoc dl(St);
+ EVT MemVT = St->getMemoryVT();
+  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
+ assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+ "Expected truncstore of i1 vector");
+
+ SDValue Op = St->getValue();
+ MVT OpVT = Op.getValueType().getSimpleVT();
+ unsigned NumElts = OpVT.getVectorNumElements();
+ if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ NumElts == 16) {
+ // Truncate and store - everything is legal
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
+ if (MemVT.getSizeInBits() < 8)
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+ St->getMemOperand());
+ }
+
+  // Only a subset of the types is legal; assume that we have only AVX-512F.
+ if (NumElts <= 8) {
+ if (NumElts < 8) {
+ // Extend to 8-elts vector
+ MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
+ DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
+ }
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
+ return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+ St->getMemOperand());
+ }
+ // v32i8
+ assert(OpVT == MVT::v32i8 && "Unexpected operand type");
+ // Divide the vector into 2 parts and store each part separately
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+ DAG.getIntPtrConstant(0, dl));
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
+ SDValue BasePtr = St->getBasePtr();
+ SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
+ St->getMemOperand());
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+ DAG.getIntPtrConstant(16, dl));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
+
+ SDValue BasePtrHi =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+ SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
+ BasePtrHi, St->getMemOperand());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
+}
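
What a vXi1 truncating store ultimately writes is a packed bit mask: each element contributes its low bit. A minimal scalar sketch for the v8i1 case (illustrative types, not DAG code); the v32i8 path above does the same for two v16i1 halves, which is why the second store lands at BasePtr + 2 (16 bits = 2 bytes):

#include <cstdint>

// Pack the low bit of each of the eight elements into one mask byte;
// this byte is what the v8i1 truncating store finally writes to memory.
static uint8_t packV8I1(const uint8_t Elts[8]) {
  uint8_t Mask = 0;
  for (unsigned I = 0; I != 8; ++I)
    Mask |= (Elts[I] & 1u) << I;
  return Mask;
}
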
+
+static SDValue LowerExtended1BitVectorLoad(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+
+ LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+ SDLoc dl(Ld);
+ EVT MemVT = Ld->getMemoryVT();
+ assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
+ "Expected i1 vector load");
+ unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
+ ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ MVT VT = Op.getValueType().getSimpleVT();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ NumElts == 16) {
+ // Load and extend - everything is legal
+ if (NumElts < 8) {
+ SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+ // Finally, do a normal sign-extend to the desired register.
+ return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
+ }
+
+ if (NumElts <= 8) {
+    // Only a subset of the types is legal; assume that we have only AVX-512F.
+ unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
+ SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
+ SDValue BitVec = DAG.getBitcast(MaskVT, Load);
+
+ if (NumElts == 8)
+ return DAG.getNode(ExtOpcode, dl, VT, BitVec);
+
+    // For v4i1 and v2i1, extend to v8i1 first and then extract the result.
+
+ MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+ SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert(VT == MVT::v32i8 && "Unexpected extload type");
+
+ SmallVector<SDValue, 2> Chains;
+
+ SDValue BasePtr = Ld->getBasePtr();
+ SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getMemOperand());
+ Chains.push_back(LoadLo.getValue(1));
+
+ SDValue BasePtrHi =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+ SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+ BasePtrHi,
+ Ld->getMemOperand());
+ Chains.push_back(LoadHi.getValue(1));
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+
+ SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
+ SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+}
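
The extending i1-vector load is the inverse: read the packed mask and fan each bit back out to a full lane. A scalar sketch for a v8i1 load, again with illustrative types rather than DAG nodes:

#include <cstdint>

// Bit I of the mask becomes lane I: 0/-1 for a sign-extending load,
// 0/1 for a zero-extending one.
static void unpackV8I1(uint8_t Mask, bool Signed, int8_t Out[8]) {
  for (unsigned I = 0; I != 8; ++I) {
    int8_t Bit = (Mask >> I) & 1;
    Out[I] = Signed ? static_cast<int8_t>(-Bit) : Bit;
  }
}
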
+
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -15465,7 +16345,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
@@ -15473,11 +16353,14 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
"We only custom lower integer vector sext loads.");
// Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
+ if (MemVT.getScalarType() == MVT::i1)
+ return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -15492,7 +16375,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
// The only way in which we have a legal 256-bit vector result but not the
// integer 256-bit operations needed to directly lower a sextload is if we
// have AVX1 but not AVX2. In that case, we can always emit a sextload to
@@ -15508,8 +16391,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
"it must be a legal 128-bit vector "
"type!");
Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
} else {
assert(MemSz < 128 &&
"Can't extend a type wider than 128 bits to a 256 bit vector!");
@@ -15522,9 +16405,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
Load =
DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
}
// Replace chain users with the new chain.
@@ -15592,8 +16474,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Perform a single load.
SDValue ScalarLoad =
DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getAlignment(), Ld->getMemOperand()->getFlags());
Chains.push_back(ScalarLoad.getValue(1));
// Create the first element type using SCALAR_TO_VECTOR in order to avoid
// another round of DAGCombining.
@@ -15615,7 +16496,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
@@ -15637,7 +16518,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
ShuffleVec[i * SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
// Bitcast to the requested type.
Shuff = DAG.getBitcast(RegVT, Shuff);
@@ -15645,9 +16526,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
return Shuff;
}
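
A rough scalar model of the widening shuffle built above, assuming a v4i8-to-v4i32 extload (so SizeRatio = 4); the real code works on whole vectors and leaves the remaining lanes undef rather than zero:

#include <cstdint>

// Loaded element I lands at wide lane I * SizeRatio. On a little-endian
// target, bitcasting Wide to v4i32 then reads each loaded byte as the
// low byte of its 32-bit lane.
static void widenByShuffle(const uint8_t Loaded[4], uint8_t Wide[16]) {
  const unsigned SizeRatio = 4; // 32 / 8
  for (unsigned I = 0; I != 16; ++I)
    Wide[I] = 0;                // stand-in for the undef lanes
  for (unsigned I = 0; I != 4; ++I)
    Wide[I * SizeRatio] = Loaded[I];
}
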
-// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
-// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
-// from the AND / OR.
+/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
+/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
@@ -15658,8 +16538,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Op.getOperand(1).hasOneUse());
}
-// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
-// 1 and that the SETCC node has a single use.
+/// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
+/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
@@ -15692,8 +16572,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
+ if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
@@ -15917,8 +16796,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
    // Look past the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -15951,7 +16829,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
+ bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack;
SDLoc dl(Op);
@@ -15966,7 +16844,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
- bool Is64Bit = Subtarget->is64Bit();
+ bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
@@ -15975,13 +16853,10 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
- EVT VT = Node->getValueType(0);
- SDValue Tmp3 = Node->getOperand(2);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
- unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
@@ -15995,12 +16870,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    // The 64-bit implementation of segmented stacks needs to clobber both r10
    // and r11. This makes it impossible to use it along with nested parameters.
const Function *F = MF.getFunction();
-
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I)
- if (I->hasNestAttr())
+ for (const auto &A : F->args()) {
+ if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
+ }
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
@@ -16009,16 +16883,11 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
- SDValue Flag;
- const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
-
- Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
- Flag = Chain.getValue(1);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
+ MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
-
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -16047,13 +16916,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
- if (!Subtarget->is64Bit() ||
- Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
+ if (!Subtarget.is64Bit() ||
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+ MachinePointerInfo(SV));
}
// __va_list_tag:
@@ -16064,45 +16933,45 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
- SDValue Store = DAG.getStore(Op.getOperand(0), DL,
- DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
- DL, MVT::i32),
- FIN, MachinePointerInfo(SV), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
- Store = DAG.getStore(Op.getOperand(0), DL,
- DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
- MVT::i32),
- FIN, MachinePointerInfo(SV, 4), false, false, 0);
+ FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
+ MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
- MachinePointerInfo(SV, 8),
- false, false, 0);
+ Store =
+ DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
- Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
+ Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
- SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
+ Store = DAG.getStore(
+ Op.getOperand(0), DL, RSFIN, FIN,
+ MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
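
For reference, the four stores above populate the System V x86-64 __va_list_tag; its LP64 layout matches the MachinePointerInfo offsets 0, 4, 8 and 16 used in the code (the ILP32/x32 variant has 4-byte pointers, hence the 4/8 and 12/16 selections):

// System V x86-64 va_list element, as populated by LowerVASTART (LP64).
struct VAListTag {
  unsigned gp_offset;      // byte 0: next GP register slot in reg_save_area
  unsigned fp_offset;      // byte 4: next XMM register slot in reg_save_area
  void *overflow_arg_area; // byte 8: next stack (overflow) argument
  void *reg_save_area;     // byte 16: base of the register save area
};
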
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->is64Bit() &&
+ assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNode()->getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
@@ -16132,9 +17001,9 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!Subtarget->useSoftFloat() &&
+ assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
- Subtarget->hasSSE1());
+ Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
@@ -16153,19 +17022,15 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Chain = VAARG.getValue(1);
// Load the next argument and return it
- return DAG.getLoad(ArgVT, dl,
- Chain,
- VAARG,
- MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
-static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
- assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
- if (Subtarget->isCallingConvWin64(
+ assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction()->getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
@@ -16183,9 +17048,9 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
-// getTargetVShiftByConstNode - Handle vector element shifts where the shift
-// amount is a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
+/// Handle vector element shifts where the shift amount is a constant.
+/// Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
@@ -16214,11 +17079,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
ConstantSDNode *ND;
switch(Opc) {
- default: llvm_unreachable(nullptr);
+ default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16230,7 +17095,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
case X86ISD::VSRLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16242,7 +17107,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
case X86ISD::VSRAI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
@@ -16253,16 +17118,16 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
break;
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
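
The constant folding above can be pictured on scalars: every defined element is shifted by the immediate and undef elements pass through untouched. A sketch for the VSHLI case over v8i16 constants, where a negative sentinel stands in for undef (an assumption of this model only):

#include <cstdint>

static void foldConstantVShli(int32_t Elts[8], unsigned ShiftAmt) {
  for (unsigned I = 0; I != 8; ++I) {
    if (Elts[I] < 0)
      continue; // undef element: keep as-is, like the loop above
    Elts[I] = static_cast<uint16_t>(Elts[I] << ShiftAmt); // wrap at 16 bits
  }
}
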
-// getTargetVShiftNode - Handle vector element shifts where the shift amount
-// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
+/// Handle vector element shifts where the shift amount may or may not be a
+/// constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
@@ -16288,7 +17153,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
// Let the shuffle legalizer expand this shift amount node.
SDValue Op0 = ShAmt.getOperand(0);
Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
- ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+ ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
} else {
    // Need to build a vector containing the shift amount.
    // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
@@ -16301,7 +17166,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
ShOps.push_back(DAG.getUNDEF(SVT));
MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+ ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
@@ -16316,8 +17181,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG, SDLoc dl) {
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getTargetConstant(1, dl, MaskVT);
@@ -16330,9 +17195,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
}
- if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
if (MaskVT == MVT::v64i1) {
- assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal, so extend/split it instead.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
@@ -16368,7 +17233,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
@@ -16393,13 +17258,14 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
+ case ISD::FP_TO_FP16:
      // We can't use ISD::VSELECT here because it is not always "Legal"
      // for the destination type. For example, vpmovqb requires only AVX512,
      // but a vselect that operates on byte elements requires BWI.
OpcodeSelect = X86ISD::SELECT;
break;
}
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
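
The masking wrapper reduces to a per-lane select between the operation's result and the pass-through value; for a zero-masked (maskz) intrinsic, PreservedSrc is undef and is first replaced by a zero vector, so unselected lanes become zero. A minimal model, with placeholder lane count and element type:

#include <cstdint>

// dst = mask ? op_result : passthru, lane by lane -- the effect of the
// X86ISD::SELECT / ISD::VSELECT node built by getVectorMaskingNode.
static void maskedSelect(const bool Mask[8], const int32_t OpRes[8],
                         const int32_t PassThru[8], int32_t Dst[8]) {
  for (unsigned I = 0; I != 8; ++I)
    Dst[I] = Mask[I] ? OpRes[I] : PassThru[I];
}
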
@@ -16413,7 +17279,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (isAllOnesConstant(Mask))
return Op;
@@ -16429,7 +17295,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
@@ -16495,7 +17361,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -16706,6 +17572,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+    case VPERM_2OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+
+ // Swap Src1 and Src2 in the node creation
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
+ Mask, PassThru, Subtarget, DAG);
+ }
case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
// Src2 is the PassThru
@@ -16764,6 +17640,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case FMA_OP_SCALAR_MASK:
+ case FMA_OP_SCALAR_MASK3:
+ case FMA_OP_SCALAR_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+      // Set the PassThru element.
+ if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
+ PassThru = Src3;
+ else
+ PassThru = Src1;
+
+ SDValue Rnd = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
case TERLOG_OP_MASK:
case TERLOG_OP_MASKZ: {
SDValue Src1 = Op.getOperand(1);
@@ -16879,49 +17779,76 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
MVT::i1),
Subtarget, DAG);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
- DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
- DAG.getValueType(MVT::i1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
- assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
- SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, dl, MVT::i8), Cond);
+ SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+ SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
+ SDValue SetCC;
+ switch (CC) {
+    case ISD::SETEQ: { // (ZF = 1 and PF = 0)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
+ SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NP, dl, MVT::i8),
+ Comi);
+ SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
+ break;
+ }
+    case ISD::SETNE: { // (ZF = 0 or PF = 1)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
+ SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_P, dl, MVT::i8),
+ Comi);
+ SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
+ break;
+ }
+ case ISD::SETGT: // (CF = 0 and ZF = 0)
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
+ break;
+ case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
+ break;
+ }
+ case ISD::SETGE: // CF = 0
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
+ break;
+ case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
+ SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
+ break;
+ default:
+ llvm_unreachable("Unexpected illegal condition!");
+ }
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
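
The flag tests in the switch above follow the COMIS{S,D} flag encoding: equal inputs set ZF=1 and PF=0, while an unordered (NaN) comparison sets ZF=PF=CF=1, which is why SETEQ must also check that PF is clear. A scalar sketch of the SETEQ condition:

#include <cmath>

// EQ = COND_E && COND_NP, exactly the AND of the two SETCCs above;
// SETNE is the complement: !ZF || PF.
static bool comiSetEq(double LHS, double RHS) {
  bool PF = std::isnan(LHS) || std::isnan(RHS); // unordered comparison
  bool ZF = PF || (LHS == RHS);                 // NaN also raises ZF
  return ZF && !PF;
}
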
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDValue CC = Op.getOperand(3);
+ unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
SDValue Sae = Op.getOperand(4);
- auto ComiType = TranslateX86ConstCondToX86CC(CC);
- // choose between ordered and unordered (comi/ucomi)
- unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
- SDValue Cond;
- if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
- Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+
+ SDValue FCmp;
+ if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
+ return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
- case VSHIFT_MASK:
- return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
- Op.getSimpleValueType(),
- Op.getOperand(1),
- Op.getOperand(2), DAG),
- Op.getOperand(4), Op.getOperand(3), Subtarget,
- DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
@@ -16940,14 +17867,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
- case BLEND: {
- SDValue Mask = Op.getOperand(3);
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
- Op.getOperand(2));
- }
case KUNPCK: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
@@ -16960,6 +17879,35 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
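
On the underlying mask bits, the KUNPCK lowering concatenates the low halves of the two sources, with Src2 supplying the low half — hence the swapped operand order in the CONCAT_VECTORS above. A sketch of the kunpckbw case (the widths are illustrative):

#include <cstdint>

// kunpckbw: Src2 supplies the low byte of the result mask, Src1 the high.
static uint16_t kunpckbw(uint16_t Src1, uint16_t Src2) {
  return static_cast<uint16_t>(((Src1 & 0xFFu) << 8) | (Src2 & 0xFFu));
}
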
+ case FIXUPIMMS:
+ case FIXUPIMMS_MASKZ:
+ case FIXUPIMM:
+    case FIXUPIMM_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Imm = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
+ Src1 : getZeroVector(VT, Subtarget, DAG, dl);
+      // The intrinsics come in two forms: with and without a rounding mode.
+      // First, check whether the intrinsic carries a rounding mode (7 operands);
+      // if not, use the "current direction" rounding mode.
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Imm, Rnd),
+ Mask, Passthru, Subtarget, DAG);
+ }
case CONVERT_TO_MASK: {
MVT SrcVT = Op.getOperand(1).getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
@@ -16995,6 +17943,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
subVec, subVec, immVal),
Mask, Passthru, Subtarget, DAG);
}
+ case BRCST32x2_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ assert((VT.getScalarType() == MVT::i32 ||
+ VT.getScalarType() == MVT::f32) && "Unexpected type!");
+      // Bitcast Src to a vector of packed 64-bit elements.
+ MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
+ MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
+ Src = DAG.getBitcast(BitcastVT, Src);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
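
A sketch of the broadcast performed after the bitcast above: viewing the source as 64-bit pairs means one broadcast replicates two adjacent 32-bit elements across the destination (the v8i32 destination here is an assumption for illustration):

#include <cstdint>

static void broadcast32x2(const uint32_t Src[2], uint32_t Dst[8]) {
  for (unsigned I = 0; I != 8; ++I)
    Dst[I] = Src[I & 1]; // even lanes get Src[0], odd lanes Src[1]
}
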
default:
break;
}
@@ -17082,7 +18045,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -17163,6 +18126,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ auto &Context = MF.getMMI().getContext();
+ MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+ Twine(MF.getFunctionNumber()));
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
+ }
+
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
@@ -17192,7 +18165,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
@@ -17206,7 +18179,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget * Subtarget) {
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@@ -17217,7 +18190,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.getOpcode() == ISD::UNDEF)
+ if (Src.isUndef())
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -17237,7 +18210,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -17255,18 +18228,19 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
//SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
-// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
-// read performance monitor counters (x86_rdpmc).
-static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
- SelectionDAG &DAG, const X86Subtarget *Subtarget,
- SmallVectorImpl<SDValue> &Results) {
+/// Handles the lowering of builtin intrinsics that read performance monitor
+/// counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
@@ -17279,7 +18253,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
// Reads the content of a 64-bit performance counter and returns it in the
// registers EDX:EAX.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
@@ -17290,7 +18264,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
}
Chain = HI.getValue(1);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// The EAX register is loaded with the low-order 32 bits. The EDX register
// is loaded with the supported high-order bits of the counter.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
@@ -17307,12 +18281,13 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
Results.push_back(Chain);
}
-// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
-// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
-// also used to custom lower READCYCLECOUNTER nodes.
-static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
- SelectionDAG &DAG, const X86Subtarget *Subtarget,
- SmallVectorImpl<SDValue> &Results) {
+/// Handles the lowering of builtin intrinsics that read the time stamp counter
+/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
+/// READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
@@ -17320,7 +18295,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
@@ -17341,10 +18316,10 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
// Explicitly store the content of ECX at the location passed in input
// to the 'rdtscp' intrinsic.
Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
}
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
// The EDX register is loaded with the high-order 32 bits of the MSR, and
// the EAX register is loaded with the low-order 32 bits.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
@@ -17361,7 +18336,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
Results.push_back(Chain);
}
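
Both RDPMC and RDTSC return the 64-bit counter split across EDX:EAX; the SHL-by-32 + OR sequence built by these helpers amounts to:

#include <cstdint>

// Glue the EDX:EAX halves into one 64-bit counter value.
static uint64_t combineEdxEax(uint32_t Eax, uint32_t Edx) {
  return (static_cast<uint64_t>(Edx) << 32) | Eax;
}
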
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
@@ -17388,44 +18363,25 @@ static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
return Chain;
}
-/// \brief Lower intrinsics for TRUNCATE_TO_MEM case
-/// return truncate Store/MaskedStore Node
-static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
- SelectionDAG &DAG,
- MVT ElementType) {
- SDLoc dl(Op);
- SDValue Mask = Op.getOperand(4);
- SDValue DataToTruncate = Op.getOperand(3);
- SDValue Addr = Op.getOperand(2);
+static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
+ SDValue EHGuard = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EHGuard only live in functions using WinEH");
- MVT VT = DataToTruncate.getSimpleValueType();
- MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
-
- if (isAllOnesConstant(Mask)) // return just a truncate store
- return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
- MachinePointerInfo(), SVT, false, false,
- SVT.getScalarSizeInBits()/8);
-
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
- // are extracted by EXTRACT_SUBVECTOR.
- SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl));
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, SVT.getStoreSize(),
- SVT.getScalarSizeInBits()/8);
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
+ EHInfo->EHGuardFrameIndex = FINode->getIndex();
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr,
- VMask, SVT, MMO, true);
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
}
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -17433,6 +18389,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
if (!IntrData) {
if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
return MarkEHRegistrationNode(Op, DAG);
+ if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+ return MarkEHGuard(Op, DAG);
if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
@@ -17491,7 +18449,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
- Scale, Chain, *Subtarget);
+ Scale, Chain, Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
@@ -17504,7 +18462,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
- *Subtarget);
+ Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
@@ -17532,7 +18490,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
}
// ADC/ADCX/SBB
case ADX: {
- SmallVector<SDValue, 2> Results;
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
@@ -17540,13 +18497,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
Op.getOperand(4), GenCF.getValue(1));
SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
- Op.getOperand(5), MachinePointerInfo(),
- false, false, 0);
+ Op.getOperand(5), MachinePointerInfo());
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86::COND_B, dl, MVT::i8),
Res.getValue(1));
- Results.push_back(SetCC);
- Results.push_back(Store);
+ SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
case COMPRESS_TO_MEM: {
@@ -17554,48 +18509,45 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue DataToCompress = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
-
MVT VT = DataToCompress.getSimpleValueType();
+
+ MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+ assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
if (isAllOnesConstant(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
- MachinePointerInfo(), false, false,
- VT.getScalarSizeInBits()/8);
+ MemIntr->getMemOperand());
SDValue Compressed =
getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
Mask, DAG.getUNDEF(VT), Subtarget, DAG);
return DAG.getStore(Chain, dl, Compressed, Addr,
- MachinePointerInfo(), false, false,
- VT.getScalarSizeInBits()/8);
+ MemIntr->getMemOperand());
}
case TRUNCATE_TO_MEM_VI8:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8);
case TRUNCATE_TO_MEM_VI16:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16);
- case TRUNCATE_TO_MEM_VI32:
- return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
- case EXPAND_FROM_MEM: {
+ case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
- SDValue PassThru = Op.getOperand(3);
+ SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
- MVT VT = Op.getSimpleValueType();
- if (isAllOnesConstant(Mask)) // return just a load
- return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
- false, VT.getScalarSizeInBits()/8);
+ MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+ assert(MemIntr && "Expected MemIntrinsicSDNode!");
- SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
- false, false, false,
- VT.getScalarSizeInBits()/8);
+ EVT VT = MemIntr->getMemoryVT();
- SDValue Results[] = {
- getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
- Mask, PassThru, Subtarget, DAG), Chain};
- return DAG.getMergeValues(Results, dl);
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
+ MemIntr->getMemOperand());
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
+ MemIntr->getMemOperand(), true);
}
- case LOADU:
- case LOADA: {
+ case EXPAND_FROM_MEM: {
SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
@@ -17605,13 +18557,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
+ MemIntr->getMemOperand());
+
if (isAllOnesConstant(Mask)) // return just a load
- return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+ return DataToExpand;
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
- MemIntr->getMemOperand(), ISD::NON_EXTLOAD);
+ SDValue Results[] = {
+ getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
+ Mask, PassThru, Subtarget, DAG), Chain};
+ return DAG.getMergeValues(Results, dl);
}
}
}
@@ -17630,25 +18585,24 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, PtrVT,
- FrameAddr, Offset),
- MachinePointerInfo(), false, false, false, 0);
+ DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+ MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+ MachinePointerInfo());
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI->setFrameAddressIsTaken(true);
@@ -17678,8 +18632,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
- MachinePointerInfo(),
- false, false, false, 0);
+ MachinePointerInfo());
return FrameAddr;
}
@@ -17687,7 +18640,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const MachineFunction &MF = DAG.getMachineFunction();
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -17703,7 +18656,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
@@ -17720,23 +18673,27 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
- return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
- return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
+ return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
- return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+ return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
+bool X86TargetLowering::needsFixedCatchObjects() const {
+ return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -17746,7 +18703,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -17758,8 +18715,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
- Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
- false, false, 0);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
@@ -17769,6 +18725,16 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
+  // If the subtarget is not 64-bit, we may need the global base reg
+  // after isel pseudo expansion, i.e., after the CGBR pass has run.
+  // Therefore, ask for the GlobalBaseReg now so that the pass inserts
+  // the code for us in case we need it.
+  // Otherwise, we would end up referencing a virtual register that is
+  // never defined!
+ if (!Subtarget.is64Bit()) {
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
+ }
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
@@ -17781,6 +18747,13 @@ SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
}
+SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
+ Op.getOperand(0));
+}
+
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
@@ -17794,9 +18767,9 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
@@ -17812,14 +18785,13 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
- OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
- MachinePointerInfo(TrmpAddr, 2),
- false, false, 2);
+ OutChains[1] =
+ DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
+ /* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
@@ -17827,29 +18799,26 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr, 10),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
- OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 12),
- false, false, 2);
+ OutChains[3] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
+ /* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
- Addr, MachinePointerInfo(TrmpAddr, 20),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
- Addr, MachinePointerInfo(TrmpAddr, 22),
- false, false, 0);
+ Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
@@ -17909,29 +18878,28 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
- OutChains[0] = DAG.getStore(Root, dl,
- DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
- Trmp, MachinePointerInfo(TrmpAddr),
- false, false, 0);
+ OutChains[0] =
+ DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
+ Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
- OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 1),
- false, false, 1);
+ OutChains[1] =
+ DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
+ /* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
- false, false, 1);
+ /* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
- OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
- MachinePointerInfo(TrmpAddr, 6),
- false, false, 1);
+ OutChains[3] =
+ DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
+ /* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -17959,7 +18927,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -17979,8 +18947,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
- SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
- MachinePointerInfo(), false, false, false, 0);
+ SDValue CWD =
+ DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
@@ -18014,6 +18982,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
@@ -18044,8 +19013,8 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
- Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
- Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
+ Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
+ Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
@@ -18064,51 +19033,112 @@ static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
-static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- MVT OpVT = VT;
- unsigned NumBits = VT.getSizeInBits();
- SDLoc dl(Op);
+ int NumElts = VT.getVectorNumElements();
+ int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+ MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
- if (VT.isVector() && Subtarget->hasAVX512())
- return LowerVectorCTLZ_AVX512(Op, DAG);
+ // Per-nibble leading zero PSHUFB lookup table.
+ const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+ /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+ /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
- Op = Op.getOperand(0);
- if (VT == MVT::i8) {
- // Zero extend to i32 since there is not an i8 bsr.
- OpVT = MVT::i32;
- Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumBytes; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+
+  // Begin by bitcasting the input to a byte vector, then split those bytes
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+ // If the hi input nibble is zero then we add both results together, otherwise
+ // we just take the hi result (by masking the lo result to zero before the
+ // add).
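+  // e.g. for the byte 0x1A the hi nibble 0x1 gives 3 via the LUT and, being
+  // non-zero, masks away the lo result: ctlz = 3. For 0x05 the hi nibble is
+  // zero (4) so the lo result (1) is added: ctlz = 4 + 1 = 5.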
+ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+ SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+ SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+ Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+ SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+  // Merge the result from vXi8 back to VT, working on the lo/hi halves
+ // of the current vector width in the same way we did for the nibbles.
+ // If the upper half of the input element is zero then add the halves'
+ // leading zero counts together, otherwise just use the upper half's.
+ // Double the width of the result until we are at target width.
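+  // e.g. merging byte counts into an i16 count for the element 0x00F0: the
+  // hi byte is zero (count 8), so add the lo byte's count (0) for a total
+  // leading-zero count of 8.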
+ while (CurrVT != VT) {
+ int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+ int CurrNumElts = CurrVT.getVectorNumElements();
+ MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+ MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+ SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+ // Check if the upper half of the input element is zero.
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getBitcast(NextVT, HiZ);
+
+ // Move the upper/lower halves to the lower bits as we'll be extending to
+ // NextVT. Mask the lower result to zero if HiZ is true and add the results
+ // together.
+ SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+ SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+ SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+ R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+ Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+ CurrVT = NextVT;
}
- // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
- SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+ return Res;
+}
- // If src is zero (i.e. bsr sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue Op0 = Op.getOperand(0);
- // Finally xor with NumBits-1.
- Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
- DAG.getConstant(NumBits - 1, dl, OpVT));
+ if (Subtarget.hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
- if (VT == MVT::i8)
- Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
- return Op;
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 128-bit vector, perform ctlz and concat the result.
+ SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+ DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+ }
+
+ assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+ return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
-static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- EVT OpVT = VT;
+ MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
+
+ if (VT.isVector())
+ return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -18117,11 +19147,22 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
- // Issue a bsr (scan bits in reverse).
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
- // And xor with NumBits-1.
+ if (Opc == ISD::CTLZ) {
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+ }
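+  // The CMOV constant is 2*NumBits-1 rather than NumBits because the XOR
+  // with NumBits-1 below maps it to the desired NumBits result.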
+
+ // Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
@@ -18136,8 +19177,6 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
if (VT.isVector()) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
SDValue N0 = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
@@ -18146,8 +19185,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
// cttz_undef(x) = (width - 1) - ctlz(lsb)
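    // e.g. for x = 0b01101000 (i8): lsb = x & -x = 0b00001000, ctlz(lsb) = 4,
    // so cttz = (8 - 1) - 4 = 3.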
- if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
- TLI.isOperationLegal(ISD::CTLZ, VT)) {
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
DAG.getNode(ISD::CTLZ, dl, VT, LSB));
@@ -18176,8 +19214,8 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
-// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
-// ones, and then concatenate the result back.
+/// Break a 256-bit integer operation into two new 128-bit ones and then
+/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -18189,13 +19227,42 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+ SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+ SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+/// Break a 512-bit integer operation into two new 256-bit ones and then
+/// concatenate the result back.
+static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ assert(VT.is512BitVector() && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -18232,7 +19299,7 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -18241,28 +19308,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget->hasInt256())
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
- // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
- // pairs, multiply and truncate.
- if (VT == MVT::v16i8 || VT == MVT::v32i8) {
- if (Subtarget->hasInt256()) {
- if (VT == MVT::v32i8) {
- MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
- SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
- SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
- SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
- SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
- SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
- DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
- }
+ // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
+ // vector pairs, multiply and truncate.
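+  // (Only the low 8 bits of each 16-bit product survive the truncation, and
+  // those are the same whether the inputs were sign- or zero-extended.)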
+ if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
+ if (Subtarget.hasInt256()) {
+ // For 512-bit vectors, split into 256-bit vectors to allow the
+ // sign-extension to occur.
+ if (VT == MVT::v64i8)
+ return Lower512IntArith(Op, DAG);
+
+ // For 256-bit vectors, split into 128-bit vectors to allow the
+ // sign-extension to occur. We don't need this on AVX512BW as we can
+ // safely sign-extend to v32i16.
+ if (VT == MVT::v32i8 && !Subtarget.hasBWI())
+ return Lower256IntArith(Op, DAG);
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
@@ -18278,7 +19343,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
} else {
@@ -18294,7 +19359,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Extract the hi parts and sign extend to i16
SDValue AHi, BHi;
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
@@ -18322,7 +19387,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
- assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
@@ -18386,8 +19451,122 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
+static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntArith(Op, DAG);
+
+ // Only i8 vectors should need custom lowering after this.
+ assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ "Unsupported vector type");
+
+ // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
+ // logical shift down the upper half and pack back to i8.
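+  // e.g. for MULHU on bytes: 200 * 3 zero-extends to 600 (0x0258) in i16;
+  // the logical shift right by 8 leaves 0x02 = mulhu(200, 3).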
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
+  // and then ashr/lshr the upper bits down to the lower bits before the
+  // multiply.
+ unsigned Opcode = Op.getOpcode();
+ unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
+ unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+
+ // AVX2 implementations - extend xmm subvectors to ymm.
+ if (Subtarget.hasInt256()) {
+ SDValue Lo = DAG.getIntPtrConstant(0, dl);
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+
+ if (VT == MVT::v32i8) {
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
+ ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
+ BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
+ AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
+ Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+ DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
+ DAG.getConstant(8, dl, MVT::v16i16));
+ // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
+ // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
+ const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 16, 17, 18, 19, 20, 21, 22, 23};
+ const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31};
+ return DAG.getNode(X86ISD::PACKUS, dl, VT,
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
+ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
+ }
+
+ SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
+ SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
+ SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+ DAG.getConstant(8, dl, MVT::v16i16));
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and zero/sign extend to i16.
+ SDValue ALo, BLo;
+ if (Subtarget.hasSSE41()) {
+ ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+ BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and zero/sign extend to i16.
+ SDValue AHi, BHi;
+ if (Subtarget.hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+ BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+  // Multiply, lshr the upper 8 bits of the lo/hi results down into the lower
+  // 8 bits, and pack back to v16i8.
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
+ RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
+
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetWin64() && "Unexpected target");
+ assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
@@ -18415,8 +19594,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
Entry.Node = StackPtr;
- InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
- false, false, 16);
+ InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
+ MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.isSExt = false;
@@ -18431,21 +19610,39 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args), 0)
+ Callee, std::move(Args))
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
-static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);
- assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
- (VT == MVT::v8i32 && Subtarget->hasInt256()));
+ // Decompose 256-bit ops into smaller 128-bit ops.
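+  // MUL_LOHI produces two results (the low and high halves of each product),
+  // so concatenate value 0 and value 1 of the half-width nodes separately.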
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
+ SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
+ SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
+ SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
+ SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
+    SDValue Lo =
+        DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
+    SDValue Hi =
+        DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
+ };
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widen result.
@@ -18461,16 +19658,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
// <a|b|c|d> => <b|undef|d|undef>
- SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// <e|f|g|h> => <f|undef|h|undef>
- SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+ makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
- (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
@@ -18494,7 +19693,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
- if (IsSigned && !Subtarget->hasSSE41()) {
+ if (IsSigned && !Subtarget.hasSSE41()) {
SDValue ShAmt = DAG.getConstant(
31, dl,
DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
@@ -18515,19 +19714,19 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
-static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() &&
- (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
+ (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget->hasInt256());
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget->hasVLX() ||
+ bool AShift = LShift && (Subtarget.hasVLX() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -18535,24 +19734,24 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
-bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
- if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
+ if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
- if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
+ if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget->hasVLX())
+ if (VT.is512BitVector() || Subtarget.hasVLX())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -18561,7 +19760,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
@@ -18611,12 +19810,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
- if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
+ if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 ||
- (Subtarget->hasInt256() && VT == MVT::v32i8) ||
+ (Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -18628,11 +19827,16 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ if (VT.is512BitVector()) {
+ assert(VT == MVT::v64i8 && "Unexpected element type!");
+ SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
+ }
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
- if (VT == MVT::v16i8 && Subtarget->hasXOP())
+ if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
@@ -18668,8 +19872,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
+ if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
+ (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
@@ -18726,7 +19930,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
@@ -18746,7 +19950,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
// Check if this build_vector node is doing a splat.
// If so, then set BaseShAmt equal to the splat value.
BaseShAmt = BV->getSplatValue();
- if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+ if (BaseShAmt && BaseShAmt.isUndef())
BaseShAmt = SDValue();
} else {
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
@@ -18787,7 +19991,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
+ if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
@@ -18808,15 +20012,16 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
assert(VT.isVector() && "Custom lowering only for vector shifts!");
- assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
+ assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
@@ -18829,7 +20034,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
- if (Subtarget->hasXOP() &&
+ if (Subtarget.hasXOP() &&
(VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
@@ -18856,7 +20061,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_BIT, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
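  // This is the usual sign-extension identity (x ^ M) - M, where M marks the
  // position the original sign bit lands in after the logical shift.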
- if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) &&
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Op.getOpcode() == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
@@ -18869,10 +20074,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
- if (Op.getOpcode() == ISD::SHL &&
+ if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
(VT == MVT::v8i16 || VT == MVT::v4i32 ||
- (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
- ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ (Subtarget.hasInt256() && VT == MVT::v16i16))) {
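+    // e.g. shl X, <1, 2, 3, 4> becomes mul X, <2, 4, 8, 16>.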
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
@@ -18881,7 +20085,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
for (unsigned i=0; i !=NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
- if (Op->getOpcode() == ISD::UNDEF) {
+ if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
@@ -18895,7 +20099,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
+ SDValue BV = DAG.getBuildVector(VT, dl, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
}
@@ -18922,15 +20126,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
// the vector shift into four scalar shifts plus four pairs of vector
// insert/extract.
- if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
- ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
unsigned TargetOpcode = X86ISD::MOVSS;
bool CanBeSimplified;
// The splat value for the first packed shift (the 'X' from the example).
SDValue Amt1 = Amt->getOperand(0);
// The splat value for the second packed shift (the 'Y' from the example).
- SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
- Amt->getOperand(2);
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
// See if it is possible to replace this node with a sequence of
// two shifts followed by a MOVSS/MOVSD
@@ -18991,7 +20193,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (VT == MVT::v4i32) {
unsigned Opc = Op.getOpcode();
SDValue Amt0, Amt1, Amt2, Amt3;
- if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
@@ -19031,14 +20233,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
if (VT == MVT::v16i8 ||
- (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget->hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
@@ -19141,7 +20343,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// It's worth extending once and using the v8i32 shifts for 16-bit types, but
// the extra overheads to get from v16i8 to v8i32 make the existing SSE
// solution better.
- if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+ if (Subtarget.hasInt256() && VT == MVT::v8i16) {
MVT ExtVT = MVT::v8i32;
unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
@@ -19151,13 +20353,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
- if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
+ if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
- SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
- SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
@@ -19172,10 +20374,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (VT == MVT::v8i16) {
unsigned ShiftOpcode = Op->getOpcode();
+ // If we have a constant shift amount, the non-SSE41 path is best as
+    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
+ bool UseSSE41 = Subtarget.hasSSE41() &&
+ !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget->hasSSE41()) {
+ if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
@@ -19192,7 +20399,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
- if (Subtarget->hasSSE41()) {
+ if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
@@ -19231,43 +20438,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.is256BitVector()) {
- unsigned NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- // Extract the two vectors
- SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
- SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
-
- // Recreate the shift amount vectors
- SDValue Amt1, Amt2;
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- // Constant shift amount
- SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
- ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
- ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
-
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
- } else {
- // Variable shift amount
- Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
- Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
- }
-
- // Issue new vector shifts for the smaller types
- V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
- V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
- }
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
return SDValue();
}
-static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -19275,7 +20452,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
SDValue Amt = Op.getOperand(1);
assert(VT.isVector() && "Custom lowering only for vector rotates!");
- assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+ assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
@@ -19363,6 +20540,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_O, DL, MVT::i32),
SDValue(Sum.getNode(), 2));
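+      // X86ISD::SETCC always produces an i8; when the node advertises an i1
+      // overflow result, record that only the low bit is defined (AssertZext
+      // i1) and truncate back down.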
+ if (N->getValueType(1) == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -19372,10 +20554,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
SDValue SetCC =
- DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(Cond, DL, MVT::i32),
SDValue(Sum.getNode(), 1));
+ if (N->getValueType(1) == MVT::i1) {
+ SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
+ DAG.getValueType(MVT::i1));
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+ }
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
@@ -19387,9 +20574,9 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget->hasCmpxchg16b();
+ return Subtarget.hasCmpxchg16b();
else
return false;
}
@@ -19409,7 +20596,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
@@ -19446,16 +20633,9 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
-static bool hasMFENCE(const X86Subtarget& Subtarget) {
- // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
- // no-sse2). There isn't any reason to disable it if the target processor
- // supports it.
- return Subtarget.hasSSE2() || Subtarget.is64Bit();
-}
-
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
@@ -19483,7 +20663,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. An mfence flushes the store buffer,
// making the optimization clearly correct.
- // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
+ // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
@@ -19492,7 +20672,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- if (!hasMFENCE(*Subtarget))
+ if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
@@ -19512,7 +20692,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
return Loaded;
}
-static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
@@ -19522,8 +20702,9 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
- if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
- if (hasMFENCE(*Subtarget))
+ if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+ FenceScope == CrossThread) {
+ if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
@@ -19545,7 +20726,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
-static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -19557,7 +20738,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
- assert(Subtarget->is64Bit() && "Node not type legal!");
+ assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
@@ -19587,14 +20768,14 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
-static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (DstVT != MVT::f64)
// This conversion needs to be expanded.
return SDValue();
@@ -19614,7 +20795,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
DAG.getIntPtrConstant(i, dl)));
} else {
- assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() &&
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
DAG.getIntPtrConstant(0, dl)));
@@ -19627,14 +20808,14 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+ SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
DAG.getIntPtrConstant(0, dl));
}
- assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
- Subtarget->hasMMX() && "Unexpected custom BITCAST");
+ assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
+ Subtarget.hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
"Unexpected custom BITCAST");
@@ -19657,12 +20838,11 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
- const X86Subtarget *Subtarget,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
- int NumElts = VT.getVectorNumElements();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
@@ -19713,16 +20893,15 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s, because i8 vector shifts
  // aren't directly supported.
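  // e.g. an i16 element whose bytes hold counts [hi, lo]: shl-by-8 gives
  // [lo, 0]; the bytewise add yields [hi + lo, lo]; srl-by-8 leaves hi + lo.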
- SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
- SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ SDValue ShifterV = DAG.getConstant(8, DL, VT);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
- return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
-static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
- const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
@@ -19750,17 +20929,14 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
int NumByteElts = VecSize / 8;
MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
SDValue In = DAG.getBitcast(ByteVecVT, Op);
- SmallVector<SDValue, 16> LUTVec;
+ SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumByteElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
- SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
- SmallVector<SDValue, 16> Mask0F(NumByteElts,
- DAG.getConstant(0x0F, DL, MVT::i8));
- SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+ SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
+ SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
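+  // In-register LUT popcount, e.g. popcnt(0xB4) = LUT[0xB] + LUT[0x4]
+  // = 3 + 1 = 4: shuffle each nibble through the LUT and sum the results.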
// High nibbles
- SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
- SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+ SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
// Low nibbles
@@ -19781,8 +20957,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
- const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is128BitVector() &&
@@ -19801,19 +20977,13 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
MVT VT = V.getSimpleValueType();
- SmallVector<SDValue, 32> Shifters(
- VT.getVectorNumElements(),
- DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
- return DAG.getNode(OpCode, DL, VT, V,
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+ SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
+ return DAG.getNode(OpCode, DL, VT, V, ShifterV);
};
auto GetMask = [&](SDValue V, APInt Mask) {
MVT VT = V.getSimpleValueType();
- SmallVector<SDValue, 32> Masks(
- VT.getVectorNumElements(),
- DAG.getConstant(Mask, DL, VT.getVectorElementType()));
- return DAG.getNode(ISD::AND, DL, VT, V,
- DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+ SDValue MaskV = DAG.getConstant(Mask, DL, VT);
+ return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
};
// We don't want to incur the implicit masks required to SRL vNi8 vectors on
@@ -19852,27 +21022,38 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
DAG);
}
-static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- // FIXME: Need to add AVX-512 support here!
- assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
- if (!Subtarget->hasSSSE3()) {
+ if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
- if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
unsigned NumElems = VT.getVectorNumElements();
// Extract each 128-bit vector, compute pop count and concat the result.
- SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+ SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+ LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
+ }
+
+ if (VT.is512BitVector() && !Subtarget.hasBWI()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 256-bit vector, compute pop count and concat the result.
+ SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
@@ -19882,26 +21063,184 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
-static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- EVT T = Node->getValueType(0);
- SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
- DAG.getConstant(0, dl, T), Node->getOperand(2));
- return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), negOp,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
+ // perform the BITREVERSE.
+ if (!VT.isVector()) {
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+ Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ MVT SVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector()) {
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+
+ MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
+ DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
+ }
+
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitreverse lowering supported.");
+
+ // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+ // perform the BSWAP in the shuffle.
+  // It's best to shuffle using the second operand, as this implicitly allows
+ // memory folding for multiple vectors.
+ SmallVector<SDValue, 16> MaskElts;
+ for (int i = 0; i != NumElts; ++i) {
+ for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+ int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+ int PermuteByte = SourceByte | (2 << 5);
+ MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+ }
+ }
+
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
+ SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+ Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+ Res, Mask);
+ return DAG.getBitcast(VT, Res);
+}
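// A standalone model of the VPPERM selector bytes built above (illustrative
// sketch, not part of this patch; assumes <cstdint>). Index 16+i selects
// byte i of the second source, OR-ing in (2 << 5) asks VPPERM to bit-reverse
// the selected byte, and walking j from high to low performs the per-element
// byte swap in the same shuffle.
static void buildVPPERMBitrevMask(uint8_t Mask[16], int ScalarSizeInBytes) {
  int NumElts = 16 / ScalarSizeInBytes;
  int K = 0;
  for (int i = 0; i != NumElts; ++i)
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j)
      Mask[K++] = (uint8_t)((16 + i * ScalarSizeInBytes + j) | (2 << 5));
}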
+
+static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (Subtarget.hasXOP())
+ return LowerBITREVERSE_XOP(Op, DAG);
+
+ assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
+ // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
+ SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+ Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
+ Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
+  // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
+  // 0-15 value (placed into the other nibble).
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
+
+ const int LoLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
+ /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
+ /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
+ /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
+ const int HiLUT[16] = {
+ /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
+ /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
+ /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
+ /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
+
+ SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
+ HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
+ }
+
+ SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
+ SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+}
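// A scalar model of the two-PSHUFB bit reversal above (illustrative sketch,
// not part of this patch; assumes <cstdint>). Rev reverses a 4-bit value;
// LoLUT above is Rev shifted into the high nibble and HiLUT is Rev in the
// low nibble, so OR-ing the two lookups reverses the whole byte.
static uint8_t bitrevByteLUT(uint8_t B) {
  static const uint8_t Rev[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                  0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return (uint8_t)((Rev[B & 0xF] << 4) | Rev[B >> 4]);
}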
+
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+ unsigned NewOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::ATOMIC_LOAD_ADD:
+ NewOpc = X86ISD::LADD;
+ break;
+ case ISD::ATOMIC_LOAD_SUB:
+ NewOpc = X86ISD::LSUB;
+ break;
+ case ISD::ATOMIC_LOAD_OR:
+ NewOpc = X86ISD::LOR;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ NewOpc = X86ISD::LXOR;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ NewOpc = X86ISD::LAND;
+ break;
+ default:
+ llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+ }
+
+ MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+ return DAG.getMemIntrinsicNode(
+ NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
+ {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+ /*MemVT=*/N->getSimpleValueType(0), MMO);
+}
+
+/// Lower atomic_load_ops into LOCK-prefixed operations.
+static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ unsigned Opc = N->getOpcode();
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
+ // can only be lowered when the result is unused. They should have already
+ // been transformed into a cmpxchg loop in AtomicExpand.
+ if (N->hasAnyUseOfValue(0)) {
+ // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+ // select LXADD if LOCK_SUB can't be selected.
+ if (Opc == ISD::ATOMIC_LOAD_SUB) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
+ RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+ RHS, AN->getMemOperand(), AN->getOrdering(),
+ AN->getSynchScope());
+ }
+ assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+ "Used AtomicRMW ops other than Add should have been expanded!");
+ return N;
+ }
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+ // RAUW the chain, but don't worry about the result, as it's unused.
+ assert(!N->hasAnyUseOfValue(0));
+ DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
+ return SDValue();
}
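// A source-level model of the ATOMIC_LOAD_SUB rewrite above (illustrative
// sketch, not part of this patch; assumes <atomic>): a used atomicrmw sub
// becomes an add of the wrapped negation so it can be selected as LXADD.
static int atomicSubViaAdd(std::atomic<int> &A, int V) {
  int NegV = (int)(0u - (unsigned)V); // wrapping 0 - V, as the DAG computes
  return A.fetch_add(NegV);           // same old value as A.fetch_sub(V)
}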
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
@@ -19914,7 +21253,8 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
// FIXME: On 32-bit, store -> fist or movq would be more efficient
// (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
- if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ if (cast<AtomicSDNode>(Node)->getOrdering() ==
+ AtomicOrdering::SequentiallyConsistent ||
!DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
cast<AtomicSDNode>(Node)->getMemoryVT(),
@@ -19955,9 +21295,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+ assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
@@ -19991,7 +21331,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -20051,7 +21391,7 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ return DAG.getBuildVector(NVT, dl, Ops);
}
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
@@ -20059,9 +21399,9 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
InOp, DAG.getIntPtrConstant(0, dl));
}
-static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
// X86 scatter kills mask register, so its type should be added to
@@ -20110,7 +21450,7 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
}
unsigned NumElts = VT.getVectorNumElements();
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // should be 512 bits wide. If both the index and data are 256-bit, but
@@ -20150,68 +21490,78 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
- return SDValue(NewScatter.getNode(), 0);
+ return SDValue(NewScatter.getNode(), 1);
}
-static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
- if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
- !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
- // This operation is legal for targets with VLX, but without
- // VLX the vector should be widened to 512 bit
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
- MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
- SDValue Src0 = N->getSrc0();
- Src0 = ExtendToType(Src0, WideDataVT, DAG);
- Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, Src0,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType());
-
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewLoad.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
- return DAG.getMergeValues(RetOps, dl);
- }
- return Op;
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked load op.");
+
+ assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked load op.");
+
+  // This operation is legal for targets with VLX, but without
+  // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ SDValue Src0 = N->getSrc0();
+ Src0 = ExtendToType(Src0, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+ N->getBasePtr(), Mask, Src0,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType());
+
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                NewLoad.getValue(0),
+                                DAG.getIntPtrConstant(0, dl));
+  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
}
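// A scalar model of the masked-load widening above (illustrative sketch, not
// part of this patch): without VLX the load is executed at 512 bits with the
// extra mask lanes padded to zero, so no memory beyond the original vector
// is touched, and only the low half of the wide result is returned.
static void maskedLoadWidened(const int *Ptr, const bool Mask[8],
                              const int Src0[8], int Out[8]) {
  bool WideMask[16] = {}; // upper mask lanes stay inactive
  int WideSrc0[16] = {};
  for (int i = 0; i < 8; ++i) {
    WideMask[i] = Mask[i];
    WideSrc0[i] = Src0[i];
  }
  // The wide masked load: active lanes read memory, inactive lanes keep the
  // pass-through value.
  int WideOut[16];
  for (int i = 0; i < 16; ++i)
    WideOut[i] = WideMask[i] ? Ptr[i] : WideSrc0[i];
  // EXTRACT_SUBVECTOR: hand back only the original low half.
  for (int i = 0; i < 8; ++i)
    Out[i] = WideOut[i];
}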
-static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
- if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
- !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
- // This operation is legal for targets with VLX, but without
- // VLX the vector should be widened to 512 bit
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
- MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
- DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
- Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
- N->isTruncatingStore());
- }
- return Op;
+ assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+ "Cannot lower masked store op.");
+
+ assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ (Subtarget.hasBWI() &&
+ (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ "Unsupported masked store op.");
+
+  // This operation is legal for targets with VLX, but without
+  // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+ Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->isTruncatingStore());
}
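// The mirror-image model for the masked store above (illustrative sketch,
// not part of this patch): zero-padding the mask guarantees the widened
// store writes no lane beyond the original vector.
static void maskedStoreWidened(int *Ptr, const bool Mask[8], const int Val[8]) {
  bool WideMask[16] = {};
  int WideVal[16] = {};
  for (int i = 0; i < 8; ++i) {
    WideMask[i] = Mask[i];
    WideVal[i] = Val[i];
  }
  for (int i = 0; i < 16; ++i)
    if (WideMask[i])
      Ptr[i] = WideVal[i]; // inactive lanes leave memory untouched
}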
-static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget->hasAVX512() &&
+ assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
@@ -20226,7 +21576,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // should be 512 bits wide. If both the index and data are 256-bit, but
@@ -20314,8 +21664,7 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
return NOOP;
}
-/// LowerOperation - Provide custom lowering hooks for some operations.
-///
+/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
@@ -20323,8 +21672,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
- case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
- case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
@@ -20377,14 +21731,18 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
- case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG);
- case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::MULHS:
+ case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
@@ -20417,11 +21775,34 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
}
}
-/// ReplaceNodeResults - Replace a node with an illegal result type
-/// with a new node built out of custom code.
+/// Places new result values for the node in Results (their number
+/// and types must exactly match those of the original return values of
+/// the node), or leaves Results empty, which indicates that the node is not
+/// to be custom lowered after all.
+void X86TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ if (!Res.getNode())
+ return;
+
+ assert((N->getNumValues() <= Res->getNumValues()) &&
+ "Lowering returned the wrong number of results!");
+
+  // Place new result values based on the result number of N.
+  // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
+  // original node; the extra chain result (the last value) should be dropped.
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
+/// Replace a node with an illegal result type with a new node built out of
+/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
@@ -20432,15 +21813,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
llvm_unreachable("Do not know how to custom type legalize this operation!");
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG by expanding vectors.
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
auto InVT = N->getValueType(0);
auto InVTSize = InVT.getSizeInBits();
const unsigned RegSize =
(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
- assert((!Subtarget->hasAVX512() || RegSize < 512) &&
+ assert((!Subtarget.hasAVX512() || RegSize < 512) &&
"512-bit vector requires AVX512");
- assert((!Subtarget->hasAVX2() || RegSize < 256) &&
+ assert((!Subtarget.hasAVX2() || RegSize < 256) &&
"256-bit vector requires AVX2");
auto ElemVT = InVT.getVectorElementType();
@@ -20503,24 +21884,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
if (StackSlot.getNode())
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
- MachinePointerInfo(),
- false, false, false, 0));
+ Results.push_back(
+ DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
else
Results.push_back(FIST);
}
return;
}
case ISD::UINT_TO_FP: {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (N->getOperand(0).getValueType() != MVT::v2i32 ||
N->getValueType(0) != MVT::v2f32)
return;
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
- SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
- MVT::f64);
- SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
@@ -20588,20 +21967,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
- Regs64bit ? X86::RBX : X86::EBX,
- swapInL, cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
- Regs64bit ? X86::RCX : X86::ECX,
- swapInH, swapInL.getValue(1));
- SDValue Ops[] = { swapInH.getValue(0),
- N->getOperand(1),
- swapInH.getValue(1) };
+ swapInH =
+ DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, cpInH.getValue(1));
+    // If the current function needs the base pointer, RBX,
+    // we shouldn't use cmpxchg directly. The lowering of that instruction
+    // clobbers RBX, and since RBX will be a reserved register the register
+    // allocator will not ensure its value is properly saved and restored
+    // around this live range.
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
- X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+      // ISel prefers the LCMPXCHG64 variant.
+      // If the assert below fires, that is no longer the case, and we need to
+      // teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX. That is a
+      // matter of accepting an i64 input for that pseudo and restoring into
+      // the register of the right width when expanding the pseudo. Everything
+      // else should just work.
+      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
+             "Saving only half of RBX");
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
+ : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
+ SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ HalfT, swapInH.getValue(1));
+ SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
+ RBXSave,
+ /*Glue*/ RBXSave.getValue(2)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ } else {
+ unsigned Opcode =
+ Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX, swapInL,
+ swapInH.getValue(1));
+ SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+ swapInL.getValue(1)};
+ Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ }
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -20639,7 +22047,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
break;
}
case ISD::BITCAST: {
- assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0)->getValueType(0);
@@ -20666,7 +22074,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
ToVecInt, DAG.getIntPtrConstant(i, dl)));
- Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+ Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
}
}
}
@@ -20703,7 +22111,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
- case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -20724,7 +22131,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::ADDUS: return "X86ISD::ADDUS";
@@ -20742,7 +22148,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -20750,6 +22158,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
+ case X86ISD::EH_SJLJ_SETUP_DISPATCH:
+ return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
@@ -20757,6 +22167,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
+ case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
+ return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
+ case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
+ return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
+ case X86ISD::LADD: return "X86ISD::LADD";
+ case X86ISD::LSUB: return "X86ISD::LSUB";
+ case X86ISD::LOR: return "X86ISD::LOR";
+ case X86ISD::LXOR: return "X86ISD::LXOR";
+ case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
@@ -20778,8 +22197,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
+ case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
@@ -20802,6 +22223,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
@@ -20842,6 +22264,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
@@ -20852,8 +22275,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::MFENCE: return "X86ISD::MFENCE";
- case X86ISD::SFENCE: return "X86ISD::SFENCE";
- case X86ISD::LFENCE: return "X86ISD::LFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
@@ -20866,6 +22287,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
+ case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
@@ -20878,6 +22300,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
+ case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
@@ -20898,6 +22322,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::ADDS: return "X86ISD::ADDS";
case X86ISD::SUBS: return "X86ISD::SUBS";
case X86ISD::AVG: return "X86ISD::AVG";
@@ -20908,26 +22333,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
+ case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
+ case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
}
return nullptr;
}
-// isLegalAddressingMode - Return true if the addressing mode represented
-// by AM is legal for this target, for a load/store of the specified type.
+/// Return true if the addressing mode represented by AM is legal for this
+/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
- Reloc::Model R = getTargetMachine().getRelocationModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
- unsigned GVFlags =
- Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+ unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
@@ -20939,8 +22365,8 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
- if ((M != CodeModel::Small || R != Reloc::Static) &&
- Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ if ((M != CodeModel::Small || isPositionIndependent()) &&
+ Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
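// The "extremely general" x86 addressing mode checked above has the shape
//   Base + Index * Scale + Disp
// with Scale in {1, 2, 4, 8} and Disp a sign-extended 32-bit immediate. A
// tiny model of the scale/displacement legality test (illustrative sketch,
// not part of this patch; assumes <cstdint>):
static bool isLegalX86ScaleAndDisp(int64_t Scale, int64_t Disp) {
  bool ScaleOK =
      Scale == 0 || Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8;
  bool DispOK = Disp >= INT32_MIN && Disp <= INT32_MAX;
  return ScaleOK && DispOK;
}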
@@ -20977,7 +22403,7 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
// On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
// variable shifts just as cheap as scalar ones.
- if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
@@ -21026,12 +22452,12 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
- return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
- return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -21062,7 +22488,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- if (!Subtarget->hasAnyFMA())
+ if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
@@ -21086,8 +22512,8 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
-/// isShuffleMaskLegal - Targets can use this to indicate that they only
-/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// Targets can use this to indicate that they only support *some*
+/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
@@ -21121,9 +22547,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
-static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
@@ -21167,21 +22593,21 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
// sinkMBB:
// EAX is live into the sinkMBB
sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(X86::EAX);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return sinkMBB;
}
// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX, all of this code can be replaced with that
// in the .td file.
-static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
@@ -21193,32 +22619,31 @@ static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
}
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- unsigned NumArgs = MI->getNumOperands();
+ unsigned NumArgs = MI.getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- if (MI->hasOneMemOperand())
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ if (MI.hasOneMemOperand())
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- BuildMI(*BB, MI, dl,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::XMM0);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
-static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
@@ -21230,93 +22655,90 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- unsigned NumArgs = MI->getNumOperands(); // remove the results
+ unsigned NumArgs = MI.getNumOperands(); // remove the results
for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i);
+ MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- if (MI->hasOneMemOperand())
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ if (MI.hasOneMemOperand())
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- BuildMI(*BB, MI, dl,
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::ECX);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
-static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// insert input VAL into EAX
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI->getOperand(0).getReg());
+ .addReg(MI.getOperand(0).getReg());
// insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
- .addReg(X86::ECX)
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
// insert zero to EDX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
- .addReg(X86::EDX)
- .addReg(X86::EDX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
+
// insert WRPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
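// A user-level model of the WRPKRU expansion above (illustrative sketch, not
// part of this patch; assumes <immintrin.h> and a PKU-enabled target). The
// instruction takes the new PKRU value in EAX and requires ECX = EDX = 0,
// which is exactly what the MOV32r0s emitted above establish.
static void writePKRU(unsigned Val) {
  _wrpkru(Val); // eax = Val, ecx = 0, edx = 0, wrpkru
}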
-static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
- .addReg(X86::ECX)
- .addReg(X86::ECX);
+ BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
// insert RDPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::EAX);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(X86::EAX);
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
-static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget *Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget,
+ unsigned Opc) {
+ DebugLoc dl = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
- unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI->getOperand(ValOps).getReg());
+ .addReg(MI.getOperand(ValOps).getReg());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
- .addReg(MI->getOperand(ValOps+1).getReg());
+ .addReg(MI.getOperand(ValOps + 1).getReg());
// The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+ BuildMI(*BB, MI, dl, TII->get(Opc));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
@@ -21328,31 +22750,31 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
- assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+ assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
- unsigned DestReg = MI->getOperand(0).getReg();
- MachineOperand &Base = MI->getOperand(1);
- MachineOperand &Scale = MI->getOperand(2);
- MachineOperand &Index = MI->getOperand(3);
- MachineOperand &Disp = MI->getOperand(4);
- MachineOperand &Segment = MI->getOperand(5);
- unsigned ArgSize = MI->getOperand(6).getImm();
- unsigned ArgMode = MI->getOperand(7).getImm();
- unsigned Align = MI->getOperand(8).getImm();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ MachineOperand &Base = MI.getOperand(1);
+ MachineOperand &Scale = MI.getOperand(2);
+ MachineOperand &Index = MI.getOperand(3);
+ MachineOperand &Disp = MI.getOperand(4);
+ MachineOperand &Segment = MI.getOperand(5);
+ unsigned ArgSize = MI.getOperand(6).getImm();
+ unsigned ArgMode = MI.getOperand(7).getImm();
+ unsigned Align = MI.getOperand(8).getImm();
// Memory Reference
- assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
@@ -21521,7 +22943,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
- assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+ assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
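// The classic power-of-two round-up used above, as a standalone helper
// (illustrative sketch, not part of this patch; assumes <cstdint>): adding
// Align-1 carries past the next boundary unless the address is already
// aligned, and masking with ~(Align-1) clears the low bits.
static uint64_t alignUp(uint64_t Addr, uint64_t Align) {
  return (Addr + (Align - 1)) & ~(Align - 1); // Align must be a power of 2
}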
@@ -21563,15 +22985,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
}
// Erase the pseudo instruction
- MI->eraseFromParent();
+ MI.eraseFromParent();
return endMBB;
}
-MachineBasicBlock *
-X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them,
@@ -21602,14 +23022,14 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
- unsigned CountReg = MI->getOperand(0).getReg();
- int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
- int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+ unsigned CountReg = MI.getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
- if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
+ if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -21618,29 +23038,29 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
- assert((MI->getNumOperands() <= 3 ||
- !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
- MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
- && "Expected last argument to be EFLAGS");
- unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ assert((MI.getNumOperands() <= 3 ||
+ !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
+ MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
+ "Expected last argument to be EFLAGS");
+ unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
- for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
+ for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
- .addFrameIndex(RegSaveFrameIndex)
- .addImm(/*Scale=*/1)
- .addReg(/*IndexReg=*/0)
- .addImm(/*Disp=*/Offset)
- .addReg(/*Segment=*/0)
- .addReg(MI->getOperand(i).getReg())
- .addMemOperand(MMO);
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI.getOperand(i).getReg())
+ .addMemOperand(MMO);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
@@ -21684,8 +23104,8 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with a
// conditional jump around it.
-static bool isCMOVPseudo(MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool isCMOVPseudo(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_GR8:
@@ -21715,10 +23135,10 @@ static bool isCMOVPseudo(MachineInstr *MI) {
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -21837,8 +23257,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// retq
//
MachineInstr *CascadedCMOV = nullptr;
- MachineInstr *LastCMOV = MI;
- X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ MachineInstr *LastCMOV = &MI;
+ X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
@@ -21849,8 +23269,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition.
- while (NextMIIt != BB->end() &&
- isCMOVPseudo(NextMIIt) &&
+ while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
@@ -21860,10 +23279,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == MI.
- if (LastCMOV == MI &&
- NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
- NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
- NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() &&
+ if (LastCMOV == &MI && NextMIIt != BB->end() &&
+ NextMIIt->getOpcode() == MI.getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
CascadedCMOV = &*NextMIIt;
}
@@ -21885,7 +23304,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
@@ -21976,12 +23395,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
if (CascadedCMOV) {
- MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
DL, TII->get(TargetOpcode::COPY),
CascadedCMOV->getOperand(0).getReg())
- .addReg(MI->getOperand(0).getReg());
+ .addReg(MI.getOperand(0).getReg());
CascadedCMOV->eraseFromParent();
}
@@ -21993,7 +23412,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Combine the following atomic floating-point modification pattern:
// a.store(reg OP a.load(acquire), release)
@@ -22002,52 +23421,55 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
// movss %xmm, (%gpr)
// Or sd equivalent for 64-bit operations.
unsigned MOp, FOp;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
- case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
- case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ case X86::RELEASE_FADD32mr:
+ FOp = X86::ADDSSrm;
+ MOp = X86::MOVSSmr;
+ break;
+ case X86::RELEASE_FADD64mr:
+ FOp = X86::ADDSDrm;
+ MOp = X86::MOVSDmr;
+ break;
}
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- MachineOperand MSrc = MI->getOperand(0);
- unsigned VSrc = MI->getOperand(5).getReg();
- const MachineOperand &Disp = MI->getOperand(3);
- MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
- bool hasDisp = Disp.isGlobal() || Disp.isImm();
- if (hasDisp && MSrc.isReg())
- MSrc.setIsKill(false);
- MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
- .addOperand(/*Base=*/MSrc)
- .addImm(/*Scale=*/1)
- .addReg(/*Index=*/0)
- .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
- .addReg(0);
- MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
- MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
- .addReg(VSrc)
- .addOperand(/*Base=*/MSrc)
- .addImm(/*Scale=*/1)
- .addReg(/*Index=*/0)
- .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
- .addReg(/*Segment=*/0);
- MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ unsigned ValOpIdx = X86::AddrNumOperands;
+ unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(FOp),
+ MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+ .addReg(VSrc);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand &Operand = MI.getOperand(i);
+ // Clear any kill flags on register operands as we'll create a second
+ // instruction using the same address operands.
+ if (Operand.isReg())
+ Operand.setIsKill(false);
+ MIB.addOperand(Operand);
+ }
+ MachineInstr *FOpMI = MIB;
+ MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI.getOperand(i));
+ MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
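
For reference, the source-level idiom this pseudo covers is exactly the acquire-load/release-store pattern named in the comment above; a minimal C++ sketch (the names `a` and `x` are illustrative, not from the patch):

    #include <atomic>

    // Matched as RELEASE_FADD32mr and emitted as:
    //   addss (%gpr), %xmm
    //   movss %xmm, (%gpr)
    void atomicFAdd(std::atomic<float> &a, float x) {
      a.store(a.load(std::memory_order_acquire) + x,
              std::memory_order_release);
    }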
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
- const bool Is64Bit = Subtarget->is64Bit();
- const bool IsLP64 = Subtarget->isTarget64BitLP64();
+ const bool Is64Bit = Subtarget.is64Bit();
+ const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
@@ -22077,11 +23499,12 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
- bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
- tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
- SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
- sizeVReg = MI->getOperand(1).getReg(),
- physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI.getOperand(1).getReg(),
+ physSPReg =
+ IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
@@ -22113,7 +23536,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -22156,43 +23579,33 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
- MI->getOperand(0).getReg())
- .addReg(mallocPtrVReg).addMBB(mallocMBB)
- .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+ MI.getOperand(0).getReg())
+ .addReg(mallocPtrVReg)
+ .addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg)
+ .addMBB(bumpMBB);
// Delete the original pseudo instruction.
- MI->eraseFromParent();
+ MI.eraseFromParent();
// And we're done.
return continueMBB;
}
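
The three blocks wired up above implement a bump-or-malloc decision. A rough scalar model follows (an assumption distilled from the code, not part of the patch; the libgcc routine name matches what mainline LLVM calls here but is not shown in this hunk):

    #include <cstddef>

    extern "C" void *__morestack_allocate_stack_space(std::size_t); // libgcc

    void *segAlloca(char *&sp, char *tlsStackLimit, std::size_t size) {
      if (sp - size >= tlsStackLimit) { // bumpMBB: new SP stays above the limit
        sp -= size;
        return sp;                      // bumpSPPtrVReg
      }
      return __morestack_allocate_stack_space(size); // mallocMBB path
    }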
MachineBasicBlock *
-X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- assert(!Subtarget->isTargetMachO());
- DebugLoc DL = MI->getDebugLoc();
- MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
- *BB->getParent(), *BB, MI, DL, false);
- MachineBasicBlock *ResumeBB = ResumeMI->getParent();
- MI->eraseFromParent(); // The pseudo instruction is gone now.
- return ResumeBB;
-}
-
-MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
+ DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
- if (!Subtarget->is32Bit())
+ if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
@@ -22203,7 +23616,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
- MI->getOperand(0).setMBB(RestoreMBB);
+ MI.getOperand(0).setMBB(RestoreMBB);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
@@ -22212,37 +23625,37 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const Constant *PerFn = MF->getFunction()->getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
- if (IsSEH && Subtarget->is32Bit()) {
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ if (IsSEH && Subtarget.is32Bit()) {
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into calls
// inside MC, therefore without the two markers shrink-wrapping
// may push the prologue/epilogue past them.
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -22257,86 +23670,89 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
}
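
The resulting instruction order is the point of the change: the CALLSEQ markers bracket TLSADDR so the prologue/epilogue cannot be shrink-wrapped past the hidden call. A sketch of the emitted sequence (pseudo names are an assumption; note CALLSEQ_START now carries two zero immediates, per the .addImm(0).addImm(0) above):

    // ADJCALLSTACKDOWN 0, 0   ; CALLSEQ_START
    // TLSADDR ...             ; lowered to a TLS libcall inside MC
    // ADJCALLSTACKUP 0, 0     ; CALLSEQ_END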
MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
- assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
- assert(MI->getOperand(3).isGlobal() && "This should be a global");
+ assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
+ assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- Subtarget->is64Bit() ?
- Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
- Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
- if (Subtarget->is64Bit()) {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV64rm), X86::RDI)
- .addReg(X86::RIP)
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ Subtarget.is64Bit() ?
+ Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
+ Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+ if (Subtarget.is64Bit()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
- } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV32rm), X86::EAX)
- .addReg(0)
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ } else if (!isPositionIndependent()) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
- TII->get(X86::MOV32rm), X86::EAX)
- .addReg(TII->getGlobalBaseReg(F))
- .addImm(0).addReg(0)
- .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
- MI->getOperand(3).getTargetFlags())
- .addReg(0);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+ .addReg(TII->getGlobalBaseReg(F))
+ .addImm(0)
+ .addReg(0)
+ .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+ MI.getOperand(3).getTargetFlags())
+ .addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
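
For the 64-bit path above, the MOV64rm/CALL64m pair corresponds to the usual Darwin TLV access sequence; an assembly sketch in C++ comments (mnemonics inferred from the opcodes, not present in the patch):

    // movq  _var@TLVP(%rip), %rdi   ; X86::MOV64rm of the TLVP global into RDI
    // callq *(%rdi)                 ; X86::CALL64m through the descriptor
    // ; address of the TLS variable comes back in RAX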
MachineBasicBlock *
-X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
- DstReg = MI->getOperand(CurOp++).getReg();
+ DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(RC->hasType(MVT::i32) && "Invalid destination!");
unsigned mainDstReg = MRI.createVirtualRegister(RC);
@@ -22384,16 +23800,15 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
- Reloc::Model RM = MF->getTarget().getRelocationModel();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
- (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+ !isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
@@ -22406,7 +23821,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
- .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
@@ -22415,9 +23830,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.addOperand(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -22428,7 +23843,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -22447,7 +23862,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
- Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
@@ -22461,21 +23876,21 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return sinkMBB;
}
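
Source-level analogue (an assumption: the pseudo implements the __builtin_setjmp contract used by SjLj EH): the direct path yields 0 through mainMBB, re-entry via longjmp lands in restoreMBB with 1, and sinkMBB merges the two through the PHI.

    static void *Buf[5]; // five-pointer GCC/Clang builtin jump buffer

    int sjSetJmpShape() {
      if (__builtin_setjmp(Buf) == 0)
        return 0; // mainDstReg path: FP/IP/SP were just stored into Buf
      return 1;   // restoreDstReg path: reached via __builtin_longjmp(Buf, 1)
    }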
MachineBasicBlock *
-X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
@@ -22485,7 +23900,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -22500,41 +23915,275 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(i), LabelOffset);
+ MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
- MIB.addDisp(MI->getOperand(i), SPOffset);
+ MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI->getOperand(i));
+ MIB.addOperand(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
- MI->eraseFromParent();
+ MI.eraseFromParent();
return MBB;
}
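
The three reloads above assume fixed slots in that buffer. A struct sketch of the layout implied by LabelOffset and SPOffset (slot order is an assumption consistent with the offsets, not spelled out in this hunk):

    struct SjLjJmpBuf {
      void *FramePtr;  // slot 0: reloaded into RBP/EBP first
      void *ResumeIP;  // slot 1 (LabelOffset): loaded into Tmp, then jumped to
      void *StackPtr;  // slot 2 (SPOffset): reloaded into RSP/ESP
      void *Spare[2];  // remaining words of the five-pointer buffer
    };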
+void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+ MVT PVT = getPointerTy(MF->getDataLayout());
+ assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
+
+ unsigned Op = 0;
+ unsigned VR = 0;
+
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+ !isPositionIndependent();
+
+ if (UseImmLabel) {
+ Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ } else {
+ const TargetRegisterClass *TRC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ VR = MRI->createVirtualRegister(TRC);
+ Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+
+ /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
+
+ if (Subtarget.is64Bit())
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB)
+ .addReg(0);
+ else
+ BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
+ .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addImm(1)
+ .addReg(0)
+ .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
+ addFrameReference(MIB, FI, 36);
+ if (UseImmLabel)
+ MIB.addMBB(DispatchBB);
+ else
+ MIB.addReg(VR);
+}
+
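
A 32-bit layout sketch of the SjLj function context that makes the two fixed offsets line up: the 36 above (resume-address slot) and the 4 read back by EmitSjLjDispatchBlock below. The field order follows SjLjEHPrepare's context type and is an assumption, not part of this patch:

    struct FunctionContext32 {
      void    *Prev;        // +0
      unsigned CallSite;    // +4: read by the dispatch block below
      unsigned Data[4];     // +8: EH selector/data words
      void    *Personality; // +24
      void    *LSDA;        // +28
      void    *JBuf[5];     // +32: JBuf[1] at +36 receives DispatchBB's address
    };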
+MachineBasicBlock *
+X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ MachineModuleInfo *MMI = &MF->getMMI();
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ int FI = MFI->getFunctionContextIndex();
+
+ // Get a mapping of the call site numbers to all of the landing pads they're
+ // associated with.
+ DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+ unsigned MaxCSNum = 0;
+ for (auto &MBB : *MF) {
+ if (!MBB.isEHPad())
+ continue;
+
+ MCSymbol *Sym = nullptr;
+ for (const auto &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+
+ assert(MI.isEHLabel() && "expected EH_LABEL");
+ Sym = MI.getOperand(0).getMCSymbol();
+ break;
+ }
+
+ if (!MMI->hasCallSiteLandingPad(Sym))
+ continue;
+
+ for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
+ CallSiteNumToLPad[CSI].push_back(&MBB);
+ MaxCSNum = std::max(MaxCSNum, CSI);
+ }
+ }
+
+ // Get an ordered list of the machine basic blocks for the jump table.
+ std::vector<MachineBasicBlock *> LPadList;
+ SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+ LPadList.reserve(CallSiteNumToLPad.size());
+
+ for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+ for (auto &LP : CallSiteNumToLPad[CSI]) {
+ LPadList.push_back(LP);
+ InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+ }
+ }
+
+ assert(!LPadList.empty() &&
+ "No landing pad destinations for the dispatch jump table!");
+
+ // Create the MBBs for the dispatch code.
+
+ // Shove the dispatch's address into the return slot in the function context.
+ MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+ DispatchBB->setIsEHPad(true);
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(TrapBB, DL, TII->get(X86::TRAP));
+ DispatchBB->addSuccessor(TrapBB);
+
+ MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(DispContBB);
+
+ // Insert MBBs.
+ MF->push_back(DispatchBB);
+ MF->push_back(DispContBB);
+ MF->push_back(TrapBB);
+
+ // Insert code into the entry block that creates and registers the function
+ // context.
+ SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
+
+ // Create the jump table and associated information
+ MachineJumpTableInfo *JTI =
+ MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
+ unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
+ const X86RegisterInfo &RI = XII->getRegisterInfo();
+
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ if (RI.hasBasePointer(*MF)) {
+ const bool FPIs64Bit =
+ Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setRestoreBasePointer(MF);
+
+ unsigned FP = RI.getFrameRegister(*MF);
+ unsigned BP = RI.getBaseRegister();
+ unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
+ MFI->getRestoreBasePointerOffset())
+ .addRegMask(RI.getNoPreservedMask());
+ } else {
+ BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
+ .addRegMask(RI.getNoPreservedMask());
+ }
+
+ unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
+ 4);
+ BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
+ .addReg(IReg)
+ .addImm(LPadList.size());
+ BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
+
+ unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
+ .addReg(IReg)
+ .addImm(1);
+ BuildMI(DispContBB, DL,
+ TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
+ .addReg(0)
+ .addImm(Subtarget.is64Bit() ? 8 : 4)
+ .addReg(JReg)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+
+ // Add the jump table entries as successors to the MBB.
+ SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+ for (auto &LP : LPadList)
+ if (SeenMBBs.insert(LP).second)
+ DispContBB->addSuccessor(LP);
+
+ // N.B. the order the invoke BBs are processed in doesn't matter here.
+ SmallVector<MachineBasicBlock *, 64> MBBLPads;
+ const MCPhysReg *SavedRegs =
+ Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ for (MachineBasicBlock *MBB : InvokeBBs) {
+ // Remove the landing pad successor from the invoke block and replace it
+ // with the new dispatch block.
+ // Keep a copy of Successors since it's modified inside the loop.
+ SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+ MBB->succ_rend());
+ // FIXME: Avoid quadratic complexity.
+ for (auto MBBS : Successors) {
+ if (MBBS->isEHPad()) {
+ MBB->removeSuccessor(MBBS);
+ MBBLPads.push_back(MBBS);
+ }
+ }
+
+ MBB->addSuccessor(DispatchBB);
+
+ // Find the invoke call and mark all of the callee-saved registers as
+ // 'implicit defined' so that they're spilled. This prevents code from
+ // moving instructions to before the EH block, where they will never be
+ // executed.
+ for (auto &II : reverse(*MBB)) {
+ if (!II.isCall())
+ continue;
+
+ DenseMap<unsigned, bool> DefRegs;
+ for (auto &MOp : II.operands())
+ if (MOp.isReg())
+ DefRegs[MOp.getReg()] = true;
+
+ MachineInstrBuilder MIB(*MF, &II);
+ for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
+ unsigned Reg = SavedRegs[RI];
+ if (!DefRegs[Reg])
+ MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ break;
+ }
+ }
+
+ // Mark all former landing pads as non-landing pads. The dispatch is the only
+ // landing pad now.
+ for (auto &LP : MBBLPads)
+ LP->setIsEHPad(false);
+
+ // The instruction is gone now.
+ MI.eraseFromParent();
+ return BB;
+}
+
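
A scalar model (a sketch, not the emitted code) of the dispatch block built above: load the 1-based call-site index from the context, bounds-check it, and branch through the new jump table:

    using LandingPad = void (*)();

    void sjljDispatch(unsigned CallSite, const LandingPad *JumpTable,
                      unsigned NumLPads) {
      if (CallSite > NumLPads)   // CMP32ri + JA_1 -> TrapBB (X86::TRAP)
        __builtin_trap();
      JumpTable[CallSite - 1](); // SUB32ri by 1, then JMP64m/JMP32m
    }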
// Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer
// to remove extra copies in the loop.
// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
MachineBasicBlock *
-X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- MachineOperand &AddendOp = MI->getOperand(3);
+ MachineOperand &AddendOp = MI.getOperand(3);
// Bail out early if the addend isn't a register - we can't switch these.
if (!AddendOp.isReg())
@@ -22565,55 +24214,120 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
assert(AddendDef.getOperand(i).isReg());
MachineOperand PHISrcOp = AddendDef.getOperand(i);
MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
- if (&PHISrcInst == MI) {
+ if (&PHISrcInst == &MI) {
// Found a matching instruction.
unsigned NewFMAOpc = 0;
- switch (MI->getOpcode()) {
- case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
- case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
- case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
- case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
- case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
- case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
- case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
- case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
- case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
- case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
- case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
- case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
- case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
- case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
- case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
- case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
- case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
- case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
- case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
- case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
-
- case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
- case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
- case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
- case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
- case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
- case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
- case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
- case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
- case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
- case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
- case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
- case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
- default: llvm_unreachable("Unrecognized FMA variant.");
+ switch (MI.getOpcode()) {
+ case X86::VFMADDPDr213r:
+ NewFMAOpc = X86::VFMADDPDr231r;
+ break;
+ case X86::VFMADDPSr213r:
+ NewFMAOpc = X86::VFMADDPSr231r;
+ break;
+ case X86::VFMADDSDr213r:
+ NewFMAOpc = X86::VFMADDSDr231r;
+ break;
+ case X86::VFMADDSSr213r:
+ NewFMAOpc = X86::VFMADDSSr231r;
+ break;
+ case X86::VFMSUBPDr213r:
+ NewFMAOpc = X86::VFMSUBPDr231r;
+ break;
+ case X86::VFMSUBPSr213r:
+ NewFMAOpc = X86::VFMSUBPSr231r;
+ break;
+ case X86::VFMSUBSDr213r:
+ NewFMAOpc = X86::VFMSUBSDr231r;
+ break;
+ case X86::VFMSUBSSr213r:
+ NewFMAOpc = X86::VFMSUBSSr231r;
+ break;
+ case X86::VFNMADDPDr213r:
+ NewFMAOpc = X86::VFNMADDPDr231r;
+ break;
+ case X86::VFNMADDPSr213r:
+ NewFMAOpc = X86::VFNMADDPSr231r;
+ break;
+ case X86::VFNMADDSDr213r:
+ NewFMAOpc = X86::VFNMADDSDr231r;
+ break;
+ case X86::VFNMADDSSr213r:
+ NewFMAOpc = X86::VFNMADDSSr231r;
+ break;
+ case X86::VFNMSUBPDr213r:
+ NewFMAOpc = X86::VFNMSUBPDr231r;
+ break;
+ case X86::VFNMSUBPSr213r:
+ NewFMAOpc = X86::VFNMSUBPSr231r;
+ break;
+ case X86::VFNMSUBSDr213r:
+ NewFMAOpc = X86::VFNMSUBSDr231r;
+ break;
+ case X86::VFNMSUBSSr213r:
+ NewFMAOpc = X86::VFNMSUBSSr231r;
+ break;
+ case X86::VFMADDSUBPDr213r:
+ NewFMAOpc = X86::VFMADDSUBPDr231r;
+ break;
+ case X86::VFMADDSUBPSr213r:
+ NewFMAOpc = X86::VFMADDSUBPSr231r;
+ break;
+ case X86::VFMSUBADDPDr213r:
+ NewFMAOpc = X86::VFMSUBADDPDr231r;
+ break;
+ case X86::VFMSUBADDPSr213r:
+ NewFMAOpc = X86::VFMSUBADDPSr231r;
+ break;
+
+ case X86::VFMADDPDr213rY:
+ NewFMAOpc = X86::VFMADDPDr231rY;
+ break;
+ case X86::VFMADDPSr213rY:
+ NewFMAOpc = X86::VFMADDPSr231rY;
+ break;
+ case X86::VFMSUBPDr213rY:
+ NewFMAOpc = X86::VFMSUBPDr231rY;
+ break;
+ case X86::VFMSUBPSr213rY:
+ NewFMAOpc = X86::VFMSUBPSr231rY;
+ break;
+ case X86::VFNMADDPDr213rY:
+ NewFMAOpc = X86::VFNMADDPDr231rY;
+ break;
+ case X86::VFNMADDPSr213rY:
+ NewFMAOpc = X86::VFNMADDPSr231rY;
+ break;
+ case X86::VFNMSUBPDr213rY:
+ NewFMAOpc = X86::VFNMSUBPDr231rY;
+ break;
+ case X86::VFNMSUBPSr213rY:
+ NewFMAOpc = X86::VFNMSUBPSr231rY;
+ break;
+ case X86::VFMADDSUBPDr213rY:
+ NewFMAOpc = X86::VFMADDSUBPDr231rY;
+ break;
+ case X86::VFMADDSUBPSr213rY:
+ NewFMAOpc = X86::VFMADDSUBPSr231rY;
+ break;
+ case X86::VFMSUBADDPDr213rY:
+ NewFMAOpc = X86::VFMSUBADDPDr231rY;
+ break;
+ case X86::VFMSUBADDPSr213rY:
+ NewFMAOpc = X86::VFMSUBADDPSr231rY;
+ break;
+ default:
+ llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineInstrBuilder MIB =
- BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(1));
+ BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(3))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(1));
MBB->insert(MachineBasicBlock::iterator(MI), MIB);
- MI->eraseFromParent();
+ MI.eraseFromParent();
}
}
@@ -22621,9 +24335,9 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
}
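
Both forms compute the same fused multiply-add; the digits name which sources feed the multiply and the add. With operands (dst, a, b), 213 computes dst = a * dst + b while 231 computes dst = a * b + dst, so 231 keeps the accumulator in dst across loop iterations. A scalar sketch of the reordering done by the BuildMI above:

    // vfmadd213: dst = a * dst + b
    // vfmadd231: dst = a * b + dst   (accumulator written back to itself)
    static float fmadd231(float Acc, float A, float B) {
      return A * B + Acc; // operands taken as (0, 3, 2, 1), as in the BuildMI
    }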
MachineBasicBlock *
-X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
@@ -22641,8 +24355,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
- case X86::WIN_ALLOCA:
- return EmitLoweredWinAlloca(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
@@ -22679,31 +24391,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
- DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned PushF =
- MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
- unsigned Pop =
- MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
- BuildMI(*BB, MI, DL, TII->get(PushF));
- BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
-
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
+ // Permit reads of the FLAGS register without it being defined.
+ // This intrinsic exists to read external processor state in flags, such as
+ // the trap flag, interrupt flag, and direction flag, none of which are
+ // modeled by the backend.
+ Push->getOperand(2).setIsUndef();
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
+
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
- DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned Push =
- MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
- MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
- BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+ MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
- MI->eraseFromParent(); // The pseudo is gone now.
+ MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
@@ -22721,8 +24437,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
@@ -22750,7 +24466,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Get the X86 opcode to use.
unsigned Opc;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
@@ -22763,35 +24479,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
- X86AddressMode AM;
- MachineOperand &Op = MI->getOperand(0);
- if (Op.isReg()) {
- AM.BaseType = X86AddressMode::RegBase;
- AM.Base.Reg = Op.getReg();
- } else {
- AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = Op.getIndex();
- }
- Op = MI->getOperand(1);
- if (Op.isImm())
- AM.Scale = Op.getImm();
- Op = MI->getOperand(2);
- if (Op.isImm())
- AM.IndexReg = Op.getImm();
- Op = MI->getOperand(3);
- if (Op.isGlobal()) {
- AM.GV = Op.getGlobal();
- } else {
- AM.Disp = Op.getImm();
- }
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
- .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+ .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), CWFrameIdx);
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// String/text processing lowering.
@@ -22803,9 +24499,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
case X86::VPCMPESTRM128MEM:
- assert(Subtarget->hasSSE42() &&
+ assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
+ return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -22816,21 +24512,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIREG:
case X86::PCMPESTRIMEM:
case X86::VPCMPESTRIMEM:
- assert(Subtarget->hasSSE42() &&
+ assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
+ return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, Subtarget);
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
+ case X86::MONITORX:
+ return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
// PKU feature
case X86::WRPKRU:
- return EmitWRPKRU(MI, BB, Subtarget);
+ return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
- return EmitRDPKRU(MI, BB, Subtarget);
+ return emitRDPKRU(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
+ return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -22846,6 +24544,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+ case X86::Int_eh_sjlj_setup_dispatch:
+ return EmitSjLjDispatchBlock(MI, BB);
+
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
@@ -22888,6 +24589,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VFMSUBADDPDr213rY:
case X86::VFMSUBADDPSr213rY:
return emitFMA3Instr(MI, BB);
+ case X86::LCMPXCHG8B_SAVE_EBX:
+ case X86::LCMPXCHG16B_SAVE_RBX: {
+ unsigned BasePtr =
+ MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ return BB;
+ }
}
}
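
For the RDFLAGS case above, a source-level analogue of the pushf/pop pair the pseudo expands to (an assumption, using GCC/Clang inline asm for the 64-bit variant):

    static unsigned long readFlags64() {
      unsigned long F;
      asm volatile("pushfq\n\tpopq %0"   // X86::PUSHF64 then X86::POP64r
                   : "=r"(F)
                   :
                   : "memory");
      return F;
    }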
@@ -22930,33 +24639,9 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::SETCC:
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned NumLoBits = 0;
- switch (IntId) {
- default: break;
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx2_pmovmskb: {
- // High bits of movmskp{s|d}, pmovmskb are known zero.
- switch (IntId) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
- case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
- case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
- case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
- case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
- case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
- case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
- }
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
- break;
- }
- }
+ case X86ISD::MOVMSK: {
+ unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
break;
}
}
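
The new MOVMSK case records that only the low NumElts bits of the result can ever be set; a scalar model (a sketch) for the v4f32 case:

    #include <cmath>

    static unsigned movmskPS(const float V[4]) {
      unsigned R = 0;
      for (int i = 0; i != 4; ++i)
        R |= unsigned(std::signbit(V[i])) << i; // one sign bit per element
      return R; // bits [31:4] are always zero, hence the KnownZero mask
    }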
@@ -22974,8 +24659,8 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
}
-/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
-/// node is a GlobalAddress + offset.
+/// Returns true (and the GlobalValue and the offset) if the node is a
+/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
const GlobalValue* &GA,
int64_t &Offset) const {
@@ -22989,11 +24674,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// Performs shuffle combines for 256-bit vectors.
/// FIXME: This could be expanded to support 512 bit vectors as well.
-static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget* Subtarget) {
+static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
@@ -23014,8 +24699,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// RESULT: V + zero extended
//
if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
- V2.getOperand(1).getOpcode() != ISD::UNDEF ||
- V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
return SDValue();
if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
@@ -23060,195 +24744,556 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// Emit a zeroed vector and insert the desired subvector on its
// first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+ SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
return SDValue();
}
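
Intrinsics-level shape (a sketch) of what the combine produces for the "V + zero extended" case described above: a zeroed 256-bit vector with the 128-bit payload inserted at lane 0:

    #include <immintrin.h>

    __m256 zeroExtend128(__m128 V) {
      // getZeroVector + insert128BitVector(Zeros, V, 0, ...)
      return _mm256_insertf128_ps(_mm256_setzero_ps(), V, 0);
    }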
+// Attempt to match a combined shuffle mask against supported unary shuffle
+// instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT) {
+ bool FloatDomain = SrcVT.isFloatingPoint() ||
+ (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
+
+ // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
+ if (!FloatDomain && SrcVT.is128BitVector() &&
+ isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ ShuffleVT = MVT::v2i64;
+ return true;
+ }
+
+ // Check if we have SSE3 which will let us use MOVDDUP etc. The
+ // instructions are no slower than UNPCKLPD but have the option to
+ // fold the input operand into even an unaligned memory load.
+ if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v2f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ }
+
+ if (SrcVT.is256BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v4f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v8f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v8f32;
+ return true;
+ }
+ }
+
+ if (SrcVT.is512BitVector() && FloatDomain) {
+ assert(Subtarget.hasAVX512() &&
+ "AVX512 required for 512-bit vector shuffles");
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v8f64;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+ Shuffle = X86ISD::MOVSLDUP;
+ ShuffleVT = MVT::v16f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(
+ Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+ Shuffle = X86ISD::MOVSHDUP;
+ ShuffleVT = MVT::v16f32;
+ return true;
+ }
+ }
+
+ // Attempt to match against broadcast-from-vector.
+ if (Subtarget.hasAVX2()) {
+ unsigned NumElts = Mask.size();
+ SmallVector<int, 64> BroadcastMask(NumElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
+ ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
+ : MVT::getIntegerVT(EltSize);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
+ Shuffle = X86ISD::VBROADCAST;
+ return true;
+ }
+ }
+
+ return false;
+}
+
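
The broadcast check above builds an all-zero mask and compares against it; a standalone sketch of the test (undef lanes written as -1, matching SM_SentinelUndef):

    static bool isBroadcastMask(const int *Mask, unsigned NumElts) {
      for (unsigned i = 0; i != NumElts; ++i)
        if (Mask[i] != 0 && Mask[i] != -1) // every defined lane picks element 0
          return false;
      return true;
    }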
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+ // Ensure we don't contain any zero elements.
+ for (int M : Mask) {
+ if (M == SM_SentinelZero)
+ return false;
+ assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
+ "Expected unary shuffle");
+ }
+
+ unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+ // Handle PSHUFLW/PSHUFHW repeated patterns.
+ if (MaskScalarSizeInBits == 16) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ ArrayRef<int> LoMask(Mask.data() + 0, 4);
+ ArrayRef<int> HiMask(Mask.data() + 4, 4);
+
+ // PSHUFLW: permute lower 4 elements only.
+ if (isUndefOrInRange(LoMask, 0, 4) &&
+ isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+ Shuffle = X86ISD::PSHUFLW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(LoMask);
+ return true;
+ }
+
+ // PSHUFHW: permute upper 4 elements only.
+ if (isUndefOrInRange(HiMask, 4, 8) &&
+ isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+ // Offset the HiMask so that we can create the shuffle immediate.
+ int OffsetHiMask[4];
+ for (int i = 0; i != 4; ++i)
+ OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+ Shuffle = X86ISD::PSHUFHW;
+ ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+ return true;
+ }
+
+ return false;
+ }
+ return false;
+ }
+
+ // We only support permutation of 32/64 bit elements after this.
+ if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
+ return false;
+
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ bool FloatDomain = SrcVT.isFloatingPoint();
+ if (FloatDomain && !Subtarget.hasAVX())
+ return false;
+
+ // Pre-AVX2 we must use float shuffles on 256-bit vectors.
+ if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
+ FloatDomain = true;
+
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // VPERMILPD can permute with a non-repeating shuffle.
+ if (FloatDomain && MaskScalarSizeInBits == 64) {
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
+ return true;
+ }
+
+ // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
+ SmallVector<int, 4> RepeatedMask;
+ if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
+ return false;
+
+ // Narrow the repeated mask for 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ scaleShuffleMask(2, RepeatedMask, WordMask);
+
+ Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+ ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+}
+
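
The PermuteImm values built above pack one selector per lane; a standalone sketch of the 2-bits-per-lane encoding behind getV4X86ShuffleImm (an assumption matching the PSHUFD/VPERMILPS immediate format):

    static unsigned v4ShuffleImm(const int M[4]) {
      unsigned Imm = 0;
      for (int i = 0; i != 4; ++i)
        Imm |= unsigned(M[i] < 0 ? 0 : M[i] & 0x3) << (2 * i); // undef -> 0
      return Imm;
    }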
+// Attempt to match a combined unary shuffle mask against supported binary
+// shuffle instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+ unsigned &Shuffle, MVT &ShuffleVT) {
+ bool FloatDomain = SrcVT.isFloatingPoint();
+
+ if (SrcVT.is128BitVector()) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ Shuffle = X86ISD::MOVLHPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ Shuffle = X86ISD::MOVHLPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
+ isTargetShuffleEquivalent(
+ Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+ return true;
+ }
+ if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
+ isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+ 13, 14, 14, 15, 15})) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+ return true;
+ }
+ }
+
+ return false;
+}
+
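
The {0, 0, 1, 1}-style masks above select an unpack of a register with itself; a scalar model (a sketch) of UNPCKL on v4f32:

    static void unpcklpsSelf(const float In[4], float Out[4]) {
      Out[0] = In[0]; Out[1] = In[0]; // interleave the low halves of In with In
      Out[2] = In[1]; Out[3] = In[1];
    }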
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
-/// This is the leaf of the recursive combinine below. When we have found some
+/// This is the leaf of the recursive combine below. When we have found some
+/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
-static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
- int Depth, bool HasPSHUFB, SelectionDAG &DAG,
+static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
+ ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
- assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
+ const X86Subtarget &Subtarget) {
+ assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
// Find the operand that enters the chain. Note that multiple uses are OK
// here, we're not going to remove the operand we find.
- SDValue Input = Op.getOperand(0);
- while (Input.getOpcode() == ISD::BITCAST)
- Input = Input.getOperand(0);
+ Input = peekThroughBitcasts(Input);
MVT VT = Input.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
SDLoc DL(Root);
- if (Mask.size() == 1) {
- int Index = Mask[0];
- assert((Index >= 0 || Index == SM_SentinelUndef ||
- Index == SM_SentinelZero) &&
- "Invalid shuffle index found!");
-
- // We may end up with an accumulated mask of size 1 as a result of
- // widening of shuffle operands (see function canWidenShuffleElements).
- // If the only shuffle index is equal to SM_SentinelZero then propagate
- // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle
- // mask, and therefore the entire chain of shuffles can be folded away.
- if (Index == SM_SentinelZero)
- DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
- else
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
- /*AddTo*/ true);
+ SDValue Res;
+
+ unsigned NumBaseMaskElts = BaseMask.size();
+ if (NumBaseMaskElts == 1) {
+ assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ /*AddTo*/ true);
return true;
}
- // Use the float domain if the operand type is a floating point type.
- bool FloatDomain = VT.isFloatingPoint();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
- // For floating point shuffles, we don't have free copies in the shuffle
- // instructions or the ability to load as part of the instruction, so
- // canonicalize their shuffles to UNPCK or MOV variants.
- //
- // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
- // vectors because it can have a load folded into it that UNPCK cannot. This
- // doesn't preclude something switching to the shorter encoding post-RA.
- //
- // FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.is128BitVector()) {
- if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
- bool Lo = Mask.equals({0, 0});
- unsigned Shuffle;
- MVT ShuffleVT;
- // Check if we have SSE3 which will let us use MOVDDUP. That instruction
- // is no slower than UNPCKLPD but has the option to fold the input operand
- // into even an unaligned memory load.
- if (Lo && Subtarget->hasSSE3()) {
- Shuffle = X86ISD::MOVDDUP;
- ShuffleVT = MVT::v2f64;
- } else {
- // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
- // than the UNPCK variants.
- Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
- ShuffleVT = MVT::v4f32;
- }
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- if (Shuffle == X86ISD::MOVDDUP)
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
- else
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
- /*AddTo*/ true);
- return true;
- }
- if (Subtarget->hasSSE3() &&
- (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
- bool Lo = Mask.equals({0, 0, 2, 2});
- unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
- MVT ShuffleVT = MVT::v4f32;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ // Don't combine if we are an AVX512/EVEX target and the mask element size
+ // is different from the root element size - this would prevent writemasks
+ // from being reused.
+ // TODO - this currently prevents all lane shuffles from occurring.
+ // TODO - check for writemasks usage instead of always preventing combining.
+ // TODO - attempt to narrow Mask back to writemask size.
+ if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
+ (RootSizeInBits == 512 ||
+ (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
+ return false;
+ }
+
+ // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+
+ // Handle 128-bit lane shuffles of 256-bit vectors.
+ if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
+ !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+ return false; // Nothing to do!
+ MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
+ : MVT::v4i64);
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // For masks that have been widened to 128-bit elements or more,
+ // narrow back down to 64-bit elements.
+ SmallVector<int, 64> Mask;
+ if (BaseMaskEltSizeInBits > 64) {
+ assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
+ int MaskScale = BaseMaskEltSizeInBits / 64;
+ scaleShuffleMask(MaskScale, BaseMask, Mask);
+ } else {
+ Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ }
+
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
+
+ // Determine the effective mask value type.
+ bool FloatDomain =
+ (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
+ (32 <= MaskEltSizeInBits);
+ MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+ : MVT::getIntegerVT(MaskEltSizeInBits);
+ MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+
+ // Attempt to match the mask against known shuffle patterns.
+ MVT ShuffleVT;
+ unsigned Shuffle, PermuteImm;
+
+ if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // Attempt to blend with zero.
+ if (NumMaskElts <= 8 &&
+ ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
+ (Subtarget.hasAVX() && VT.is256BitVector()))) {
+ // Convert VT to a type compatible with X86ISD::BLENDI.
+    // TODO - add v16i16 support (requires lane duplication).
+ MVT ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
+
+ if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
+ /*Low*/ 0) &&
+ NumMaskElts <= ShuffleVT.getVectorNumElements()) {
+ unsigned BlendMask = 0;
+ unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
+ unsigned MaskRatio = ShuffleSize / NumMaskElts;
+
+ if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
+ return false;
+
+ for (unsigned i = 0; i != ShuffleSize; ++i)
+ if (Mask[i / MaskRatio] < 0)
+ BlendMask |= 1u << i;
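+      // For example, Mask = {0, -1, 2, -1} with v8i16 ShuffleVT (MaskRatio
+      // of 2) sets bits 2,3,6,7 for BlendMask = 0xCC, selecting Zero there.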
+
+ SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
+ Res = DAG.getBitcast(ShuffleVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
- bool Lo = Mask.equals({0, 0, 1, 1});
- unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
- MVT ShuffleVT = MVT::v4f32;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
+ }
+
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+ (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ unsigned InsertPSMask;
+ SDValue V1 = Input, V2 = Input;
+ if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+ Zeroable, Mask, DAG)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
return false; // Nothing to do!
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
}
- // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
- // variants as none of these have single-instruction variants that are
- // superior to the UNPCK formulation.
- if (!FloatDomain && VT.is128BitVector() &&
- (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
- Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
- Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
- Mask.equals(
- {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
- bool Lo = Mask[0] == 0;
- unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
- if (Depth == 1 && Root->getOpcode() == Shuffle)
- return false; // Nothing to do!
- MVT ShuffleVT;
- switch (Mask.size()) {
- case 8:
- ShuffleVT = MVT::v8i16;
- break;
- case 16:
- ShuffleVT = MVT::v16i8;
- break;
- default:
- llvm_unreachable("Impossible mask size!");
- };
- Op = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
- /*AddTo*/ true);
- return true;
- }
-
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
return false;
- // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
- // can replace them with a single PSHUFB instruction profitably. Intel's
- // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
- // in practice PSHUFB tends to be *very* fast so we're more aggressive.
- if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
+ return false;
+
+ bool MaskContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+  // If we have a single input shuffle with different shuffle patterns in
+  // the 128-bit lanes, use the variable mask form of VPERMILPS.
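+  // For example, the v8f32 mask {1, 0, 3, 2, 6, 7, 4, 5} uses a different
+  // pattern in each 128-bit lane, which the immediate form of VPERMILPS
+  // cannot express, but the index vector {1, 0, 3, 2, 2, 3, 0, 1} can.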
+ // TODO Combine other mask types at higher depths.
+ if (HasVariableMask && !MaskContainsZeros &&
+ ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+ SmallVector<SDValue, 16> VPermIdx;
+ for (int M : Mask) {
+ SDValue Idx =
+ M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+ VPermIdx.push_back(Idx);
+ }
+ MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+ SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ // If we have 3 or more shuffle instructions or a chain involving a variable
+ // mask, we can replace them with a single PSHUFB instruction profitably.
+  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
+ // instructions, but in practice PSHUFB tends to be *very* fast so we're
+ // more aggressive.
+ if ((Depth >= 3 || HasVariableMask) &&
+ ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (VT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (VT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = VT.getSizeInBits() / 8;
- int Ratio = NumBytes / Mask.size();
+ int Ratio = NumBytes / NumMaskElts;
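+    // For example, a v4i32 mask {1, 0, SM_SentinelZero, SM_SentinelUndef}
+    // expands with Ratio = 4 to the byte mask
+    // {4,5,6,7, 0,1,2,3, 255,255,255,255, u,u,u,u}; bytes with bit 7 set
+    // (255) make PSHUFB write zero into that lane.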
for (int i = 0; i < NumBytes; ++i) {
- if (Mask[i / Ratio] == SM_SentinelUndef) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
- int M = Mask[i / Ratio] != SM_SentinelZero
- ? Ratio * Mask[i / Ratio] + i % Ratio
- : 255;
+ if (M == SM_SentinelZero) {
+ PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+      assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
- Op = DAG.getBitcast(ByteVT, Input);
- DCI.AddToWorklist(Op.getNode());
- SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
+ Res = DAG.getBitcast(ByteVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
- DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
+ Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
@@ -23288,10 +25333,10 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
ArrayRef<int> RootMask,
- int Depth, bool HasPSHUFB,
+ int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
if (Depth > 8)
@@ -23310,13 +25355,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
- if (!isTargetShuffle(Op.getOpcode()))
- return false;
+ // Extract target shuffle mask and resolve sentinels and inputs.
+ SDValue Input0, Input1;
SmallVector<int, 16> OpMask;
- bool IsUnary;
- bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
- // We only can combine unary shuffles which we can decode the mask for.
- if (!HaveMask || !IsUnary)
+ if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
return false;
assert(VT.getVectorNumElements() == OpMask.size() &&
@@ -23327,6 +25369,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
+ int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
assert(((RootRatio == 1 && OpRatio == 1) ||
@@ -23334,13 +25377,13 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
"Must not have a ratio for both incoming and op masks!");
SmallVector<int, 16> Mask;
- Mask.reserve(std::max(OpMask.size(), RootMask.size()));
+ Mask.reserve(MaskWidth);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
- for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
+ for (int i = 0; i < MaskWidth; ++i) {
int RootIdx = i / RootRatio;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
@@ -23362,45 +25405,56 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
RootMaskedIdx % OpRatio);
}
- // See if we can recurse into the operand to combine more things.
- switch (Op.getOpcode()) {
- case X86ISD::PSHUFB:
- HasPSHUFB = true;
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- if (Op.getOperand(0).hasOneUse() &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ // Handle the all undef/zero cases early.
+ if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
+ DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
+ return true;
+ }
+ if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
+ // TODO - should we handle the mixed zero/undef case as well? Just returning
+    // a zero mask will lose information on undef elements, possibly reducing
+ // future combine possibilities.
+ DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
+ Subtarget, DAG, SDLoc(Root)));
+ return true;
+ }
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- assert(Op.getOperand(0) == Op.getOperand(1) &&
- "We only combine unary shuffles!");
- // We can't check for single use, we have to check that this shuffle is the
- // only user.
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ int MaskSize = Mask.size();
+ bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
+ [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
+ bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
+ [MaskSize](int Idx) { return MaskSize <= Idx; });
+
+ // At the moment we can only combine unary shuffle mask cases.
+ if (UseInput0 && UseInput1)
+ return false;
+ else if (UseInput1) {
+ std::swap(Input0, Input1);
+ ShuffleVectorSDNode::commuteMask(Mask);
}
+ assert(Input0 && "Shuffle with no inputs detected");
+
+ HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+
+ // See if we can recurse into Input0 (if it's a target shuffle).
+ if (Op->isOnlyUserOf(Input0.getNode()) &&
+ combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
+ HasVariableMask, DAG, DCI, Subtarget))
+ return true;
+
// Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with squential pairs of
+ // to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 16> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
- WidenedMask.clear();
}
- return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
- Subtarget);
+ return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
+ DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
@@ -23410,8 +25464,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
+ SmallVector<SDValue, 2> Ops;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary);
+ bool HaveMask =
+ getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
@@ -23647,9 +25703,9 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
}
/// \brief Try to combine x86 target specific shuffles.
-static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
@@ -23681,8 +25737,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
auto Op0 = N.getOperand(0);
auto Op1 = N.getOperand(1);
- if (Op0.getOpcode() == ISD::UNDEF &&
- Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
unsigned NumElts = VT.getVectorNumElements();
@@ -23719,6 +25774,129 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}
+ // Attempt to merge blend(insertps(x,y),zero).
+ if (V0.getOpcode() == X86ISD::INSERTPS ||
+ V1.getOpcode() == X86ISD::INSERTPS) {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+
+ // Determine which elements are known to be zero.
+ SmallVector<int, 8> TargetMask;
+ SmallVector<SDValue, 2> BlendOps;
+ if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
+ return SDValue();
+
+ // Helper function to take inner insertps node and attempt to
+ // merge the blend with zero into its zero mask.
+ auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
+ if (V.getOpcode() != X86ISD::INSERTPS)
+ return SDValue();
+ SDValue Op0 = V.getOperand(0);
+ SDValue Op1 = V.getOperand(1);
+ SDValue Op2 = V.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+
+ // Check each element of the blend node's target mask - must either
+ // be zeroable (and update the zero mask) or selects the element from
+ // the inner insertps node.
+ for (int i = 0; i != 4; ++i)
+ if (TargetMask[i] < 0)
+ InsertPSMask |= (1u << i);
+ else if (TargetMask[i] != (i + Offset))
+ return SDValue();
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ };
+
+ if (SDValue V = MergeInsertPSAndBlend(V0, 0))
+ return V;
+ if (SDValue V = MergeInsertPSAndBlend(V1, 4))
+ return V;
+ }
+ return SDValue();
+ }
+ case X86ISD::INSERTPS: {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ SDValue Op2 = N.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
+ unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
+ unsigned ZeroMask = InsertPSMask & 0xF;
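+    // The immediate encodes SrcIdx in bits [7:6], DstIdx in bits [5:4] and
+    // the zero mask in bits [3:0]. For example, 0x61 (0b01100001) inserts
+    // element 1 of Op1 into element 2 of Op0 and zeroes element 0.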
+
+ // If we zero out all elements from Op0 then we don't need to reference it.
+ if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // If we zero out the element from Op1 then we don't need to reference it.
+ if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+ // Attempt to merge insertps Op1 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask1;
+ SmallVector<SDValue, 2> Ops1;
+ if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
+ int M = TargetMask1[SrcIdx];
+ if (isUndefOrZero(M)) {
+ // Zero/UNDEF insertion - zero out element and remove dependency.
+ InsertPSMask |= (1u << DstIdx);
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+ // Update insertps mask srcidx and reference the source input directly.
+ assert(0 <= M && M < 8 && "Shuffle index out of range");
+ InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
+ Op1 = Ops1[M < 4 ? 0 : 1];
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ }
+
+ // Attempt to merge insertps Op0 with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask0;
+ SmallVector<SDValue, 2> Ops0;
+ if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
+ return SDValue();
+
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ int M = TargetMask0[i];
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (isUndefOrZero(M)) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+    // The input element must stay in place: index i from either input.
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
+
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
+
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
return SDValue();
}
default:
@@ -23814,12 +25992,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
- if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
return SDValue();
// We only handle target-independent shuffles.
@@ -23865,13 +26043,10 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}
-/// PerformShuffleCombine - Performs several different shuffle combines.
-static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// Don't create instructions with illegal types after legalize types has run.
@@ -23886,9 +26061,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return AddSub;
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
+ if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
- return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+ return combineShuffle256(N, DAG, DCI, Subtarget);
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
@@ -23903,8 +26078,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
// potentially need to be further expanded (or custom lowered) into a
// less optimal sequence of dag nodes.
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
- N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
- N0.getOpcode() == ISD::BITCAST) {
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
SDValue BC0 = N0.getOperand(0);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
@@ -23936,7 +26115,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
- return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
}
}
}
@@ -23952,9 +26131,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return LD;
if (isTargetShuffle(N->getOpcode())) {
- SDValue Shuffle =
- PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
- if (Shuffle.getNode())
+ if (SDValue Shuffle =
+ combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
@@ -23973,8 +26151,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
-/// specific shuffle of a load can be folded into a single element load.
+/// Check if a vector extract from a target-specific shuffle of a load can be
+/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
@@ -24012,9 +26190,10 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleOps;
bool UnaryShuffle;
if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
- ShuffleMask, UnaryShuffle))
+ ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
@@ -24029,12 +26208,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return DAG.getUNDEF(EltVT);
assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
- : InVec.getOperand(1);
+ SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
+ : ShuffleOps[1];
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getNumOperands() > 1 &&
- InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses =
+ (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
@@ -24068,18 +26247,16 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
- : InVec.getOperand(1);
- Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
- InVec.getOperand(0), Shuffle,
- &ShuffleMask[0]);
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
+ Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
+ ShuffleMask);
Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -24108,8 +26285,8 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
- if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
- (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+ if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64)) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getOperand(0).getValueType() == VT) {
@@ -24121,13 +26298,12 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
-/// generation and convert it from being a bunch of shuffles and extracts
-/// into a somewhat faster sequence. For i686, the best sequence is apparently
-/// storing the value and loading scalars back, while for x64 we should
-/// use 64-bit extracts and shifts.
-static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+/// Detect vector gather/scatter index generation and convert it from being a
+/// bunch of shuffles and extracts into a somewhat faster sequence.
+/// For i686, the best sequence is apparently storing the value and loading
+/// scalars back, while for x64 we should use 64-bit extracts and shifts.
+static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
@@ -24136,25 +26312,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32) {
+ InputVector.getValueType() == MVT::v2i32 &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) == 0) {
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
// The bitcast source is a direct mmx result.
- SDValue MMXSrc = InputVector.getNode()->getOperand(0);
if (MMXSrc.getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
-
- // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
- if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
- MMXSrc.getValueType() == MVT::i64) {
- SDValue MMXSrcOp = MMXSrc.getOperand(0);
- if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
- MMXSrcOp.getValueType() == MVT::v1i64 &&
- MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0), MMXSrcOp.getOperand(0));
- }
+ return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
EVT VT = N->getValueType(0);
@@ -24236,7 +26401,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo());
EVT ElementType = InputVector.getValueType().getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
@@ -24251,10 +26416,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
// Load the scalar.
- Vals[i] = DAG.getLoad(ElementType, dl, Ch,
- ScalarAddr, MachinePointerInfo(),
- false, false, false, 0);
-
+ Vals[i] =
+ DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
}
}
@@ -24272,55 +26435,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue
-transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc dl(N);
- SDValue Cond = N->getOperand(0);
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
-
- if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
- SDValue CondSrc = Cond->getOperand(0);
- if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
- Cond = CondSrc->getOperand(0);
- }
-
- if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
- return SDValue();
-
- // A vselect where all conditions and data are constants can be optimized into
- // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
- if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
- ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
- return SDValue();
-
- unsigned MaskValue = 0;
- if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
- return SDValue();
-
- MVT VT = N->getSimpleValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> ShuffleMask(NumElems, -1);
- for (unsigned i = 0; i < NumElems; ++i) {
- // Be sure we emit undef where we can.
- if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
- ShuffleMask[i] = -1;
- else
- ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
- }
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
- return SDValue();
- return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
-}
-
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
-/// nodes.
-static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Do target-specific dag combines on SELECT and VSELECT nodes.
+static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
@@ -24337,8 +26455,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
- (Subtarget->hasSSE2() ||
- (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+ (Subtarget.hasSSE2() ||
+ (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
@@ -24476,7 +26594,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
EVT CondVT = Cond.getValueType();
- if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+ if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
@@ -24487,7 +26605,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16) &&
- !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
+ !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
DCI.AddToWorklist(Cond.getNode());
return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
@@ -24625,8 +26743,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
- ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
- (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
@@ -24730,25 +26848,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
// condition so that the blends can use the high bit of each element and use
@@ -24780,10 +26879,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
- if (VT.is128BitVector() && !Subtarget->hasSSE41())
+ if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
- if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
+ if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -24837,6 +26936,73 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Combine:
+/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
+/// to:
+/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
+ SelectionDAG &DAG) {
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
+
+ // This only applies to variations of the common case:
+ // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
+ // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
+ // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
+ // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
+  // With the proper condition codes (see below), overflow is accounted for.
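+  // For example, (brcond (cmp (atomic_load_add x, 1), 0), COND_S) can reuse
+  // the flags of a single "lock add $1, x" with COND_LE instead of needing
+  // a LOCK XADD followed by a separate TEST.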
+
+ // FIXME: We can generalize both constraints:
+ // - XOR/OR/AND (if they were made to survive AtomicExpand)
+ // - LHS != 1
+ // if the result is compared.
+
+ SDValue CmpLHS = Cmp.getOperand(0);
+ SDValue CmpRHS = Cmp.getOperand(1);
+
+ if (!CmpLHS.hasOneUse())
+ return SDValue();
+
+ auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
+ if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+ return SDValue();
+
+ const unsigned Opc = CmpLHS.getOpcode();
+
+ if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
+ return SDValue();
+
+ SDValue OpRHS = CmpLHS.getOperand(2);
+ auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
+ if (!OpRHSC)
+ return SDValue();
+
+ APInt Addend = OpRHSC->getAPIntValue();
+ if (Opc == ISD::ATOMIC_LOAD_SUB)
+ Addend = -Addend;
+
+ if (CC == X86::COND_S && Addend == 1)
+ CC = X86::COND_LE;
+ else if (CC == X86::COND_NS && Addend == 1)
+ CC = X86::COND_G;
+ else if (CC == X86::COND_G && Addend == -1)
+ CC = X86::COND_GE;
+ else if (CC == X86::COND_LE && Addend == -1)
+ CC = X86::COND_L;
+ else
+ return SDValue();
+
+ SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+ DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+ return LockOp;
+}
+
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
@@ -24853,10 +27019,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
- // Quit if not CMP and SUB with its value result used.
- if (Cmp.getOpcode() != X86ISD::CMP &&
- (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
- return SDValue();
+ // This combine only operates on CMP-like nodes.
+ if (!(Cmp.getOpcode() == X86ISD::CMP ||
+ (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+ return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
@@ -24890,6 +27056,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AssertZext ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
@@ -24897,7 +27064,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
- if (OpIdx == -1)
+ if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
@@ -25008,10 +27175,20 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
return true;
}
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG) {
+ if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+ return R;
+ return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
-static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
// If the flag operand isn't dead, don't touch this CMOV.
@@ -25034,15 +27211,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
- SDValue Flags;
-
- Flags = checkBoolTestSetCCCombine(Cond, CC);
- if (Flags.getNode() &&
- // Extra check as FCMOV only supports a subset of X86 cond.
- (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
- SDValue Ops[] = { FalseOp, TrueOp,
- DAG.getConstant(CC, DL, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ // Try to simplify the EFLAGS and condition code operands.
+ // We can't always do this as FCMOV only supports a subset of X86 cond.
+ if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+ if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+ SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+ Flags};
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ }
}
// If this is a select between two integer constants, try to do some
@@ -25218,11 +27394,216 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformMulCombine - Optimize a single multiply with constant into two
-/// in order to implement it with two cheaper instructions, e.g.
-/// LEA + SHL, LEA + LEA.
-static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+ EVT VT = N->getOperand(0).getValueType();
+ if (VT.getScalarSizeInBits() != 32)
+ return false;
+
+ assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+ unsigned SignBits[2] = {1, 1};
+ bool IsPositive[2] = {false, false};
+ for (unsigned i = 0; i < 2; i++) {
+ SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
+    // compute the sign bits for it separately.
+ if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+      // The high bits of an any-extend are undefined, so it is safe to
+      // assume an appropriate number of leading sign/zero bits.
+ if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+ SignBits[i] = 25;
+ else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+ MVT::i16)
+ SignBits[i] = 17;
+ else
+ return false;
+ IsPositive[i] = true;
+ } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+ // All the operands of BUILD_VECTOR need to be int constant.
+ // Find the smallest value range which all the operands belong to.
+ SignBits[i] = 32;
+ IsPositive[i] = true;
+ for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+ if (SubOp.isUndef())
+ continue;
+ auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+ if (!CN)
+ return false;
+ APInt IntVal = CN->getAPIntValue();
+ if (IntVal.isNegative())
+ IsPositive[i] = false;
+ SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+ }
+ } else {
+ SignBits[i] = DAG.ComputeNumSignBits(Opd);
+ if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+ IsPositive[i] = true;
+ }
+ }
+
+ bool AllPositive = IsPositive[0] && IsPositive[1];
+ unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
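+  // A 32-bit value with N sign bits fits in 32 - N + 1 signed bits, so 25
+  // sign bits imply an i8 range and 17 imply an i16 range; known-positive
+  // values gain one extra bit for the unsigned modes.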
+ // When ranges are from -128 ~ 127, use MULS8 mode.
+ if (MinSignBits >= 25)
+ Mode = MULS8;
+ // When ranges are from 0 ~ 255, use MULU8 mode.
+ else if (AllPositive && MinSignBits >= 24)
+ Mode = MULU8;
+ // When ranges are from -32768 ~ 32767, use MULS16 mode.
+ else if (MinSignBits >= 17)
+ Mode = MULS16;
+ // When ranges are from 0 ~ 65535, use MULU16 mode.
+ else if (AllPositive && MinSignBits >= 16)
+ Mode = MULU16;
+ else
+ return false;
+ return true;
+}
+
+/// When the operands of vector mul are extended from smaller size values,
+/// like i8 and i16, the type of mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+/// %2 = sext/zext <N x i8> %1 to <N x i32>
+/// %4 = sext/zext <N x i8> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+/// %2 = zext/sext <N x i16> %1 to <N x i32>
+/// %4 = zext/sext <N x i16> %3 to <N x i32>
+/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+/// %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // pmulld is supported since SSE41. It is better to use pmulld
+ // instead of pmullw+pmulhw.
+ if (Subtarget.hasSSE41())
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(N, DAG, Mode))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getOperand(0).getValueType();
+ unsigned RegSize = 128;
+ MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+ EVT ReducedVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+ // Shrink the operands of mul.
+ SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+ SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+ if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8) {
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
+ } else {
+ MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
+
+ // Repack the lower part and higher part result of mul into a wider
+ // result.
+ // Generate shuffle functioning as punpcklwd.
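+      // For example, with v8i16 MulLo = {l0..l7} and MulHi = {h0..h7} this
+      // selects {l0,h0,l1,h1,l2,h2,l3,h3}, which bitcast to v4i32 yields
+      // the full 32-bit products of elements 0..3.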
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+ ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+ }
+ } else {
+ // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+ // to legalize the mul explicitly because implicit legalization for type
+ // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+ // instructions which will not exist when we explicitly legalize it by
+ // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+ // <4 x i16> undef).
+ //
+ // Legalize the operands of mul.
+ SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+ DAG.getUNDEF(ReducedVT));
+ Ops[0] = NewN0;
+ NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+ Ops[0] = NewN1;
+ NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+ if (Mode == MULU8 || Mode == MULS8) {
+ // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+ // part is needed.
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+ // convert the type of mul result to VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG,
+ DL, ResVT, Mul);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else {
+ // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+ // MULU16/MULS16, both parts are needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ OpsVT, NewN0, NewN1);
+
+ // Repack the lower part and higher part result of mul into a wider
+ // result. Make sure the type of mul result is VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+ Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+}
+
+/// Optimize a single multiply with constant into two operations in order to
+/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
+static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (DCI.isBeforeLegalize() && VT.isVector())
+ return reduceVMULWidth(N, DAG, Subtarget);
+
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
@@ -25230,7 +27611,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
@@ -25307,7 +27687,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
@@ -25320,7 +27700,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- APInt ShAmt = N1C->getAPIntValue();
+ const APInt &ShAmt = N1C->getAPIntValue();
Mask = Mask.shl(ShAmt);
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -25367,7 +27747,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
@@ -25424,11 +27804,11 @@ static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
- (!Subtarget->hasInt256() ||
+ (!Subtarget.hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
@@ -25436,7 +27816,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
- APInt ShiftAmt = AmtSplat->getAPIntValue();
+ const APInt &ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount =
VT.getSimpleVT().getVectorElementType().getSizeInBits();
@@ -25451,16 +27831,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformShiftCombine - Combine shifts.
-static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (N->getOpcode() == ISD::SHL)
- if (SDValue V = PerformSHLCombine(N, DAG))
+ if (SDValue V = combineShiftLeft(N, DAG))
return V;
if (N->getOpcode() == ISD::SRA)
- if (SDValue V = PerformSRACombine(N, DAG))
+ if (SDValue V = combineShiftRightAlgebraic(N, DAG))
return V;
// Try to fold this logical shift into a zero vector.
@@ -25471,17 +27850,17 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
-// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
-// and friends. Likewise for OR -> CMPNEQSS.
-static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
+/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
+/// OR -> CMPNEQSS.
+static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
@@ -25530,7 +27909,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
- if (Subtarget->hasAVX512()) {
+ if (Subtarget.hasAVX512()) {
SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
@@ -25547,7 +27926,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
- if (is64BitFP && !Subtarget->is64Bit()) {
+ if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones of all zeroes, we don't need all the
@@ -25574,34 +27953,47 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
-/// so it can be folded inside ANDNP.
-static bool CanFoldXORWithAllOnes(const SDNode *N) {
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::AND);
+
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
- // Match direct AllOnes for 128 and 256-bit vectors
- if (ISD::isBuildVectorAllOnes(N))
- return true;
+ if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
+ VT != MVT::v8i64 && VT != MVT::v16i32 &&
+ VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
+ return SDValue();
- // Look through a bit convert.
- if (N->getOpcode() == ISD::BITCAST)
- N = N->getOperand(0).getNode();
-
- // Sometimes the operand may come from a insert_subvector building a 256-bit
- // allones vector
- if (VT.is256BitVector() &&
- N->getOpcode() == ISD::INSERT_SUBVECTOR) {
- SDValue V1 = N->getOperand(0);
- SDValue V2 = N->getOperand(1);
-
- if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V1.getOperand(0).getOpcode() == ISD::UNDEF &&
- ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
- ISD::isBuildVectorAllOnes(V2.getNode()))
- return true;
- }
+ // Canonicalize XOR to the left.
+ if (N1.getOpcode() == ISD::XOR)
+ std::swap(N0, N1);
- return false;
+ if (N0.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+
+ N01 = peekThroughBitcasts(N01);
+
+ // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
+ // insert_subvector building a 256-bit AllOnes vector.
+ if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
+ if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
+ return SDValue();
+
+ SDValue V1 = N01->getOperand(0);
+ SDValue V2 = N01->getOperand(1);
+ if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
+ !V1.getOperand(0).isUndef() ||
+ !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
+ !ISD::isBuildVectorAllOnes(V2.getNode()))
+ return SDValue();
+ }
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
}
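
The new fold is easy to sanity-check with a one-lane scalar model. This is an editor's sketch in plain C++ (not LLVM API); the vector form applies the same identity lane-wise, with the all-ones operand supplied by the build_vector / insert_subvector patterns matched above.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x00FF00FF00FF00FFULL;
  uint64_t Y = 0x123456789ABCDEF0ULL;
  uint64_t AllOnes = ~0ULL; // one lane of the matched all-ones vector
  // (and (xor X, -1), Y) is exactly ~X & Y, i.e. a single ANDNP.
  assert(((X ^ AllOnes) & Y) == (~X & Y));
  return 0;
}
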
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -25610,7 +28002,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
// some of the transition sequences.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
@@ -25660,8 +28052,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
if (RHSConstSplat) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
SDValue(RHSConstSplat, 0));
- SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
+ N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
@@ -25687,9 +28078,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
}
}
-static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
@@ -25705,8 +28096,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// The other side of the AND should be a splat of 2^C, where C
// is the number of bits in the source type.
- if (N1.getOpcode() == ISD::BITCAST)
- N1 = N1.getOperand(0);
+ N1 = peekThroughBitcasts(N1);
if (N1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
@@ -25715,10 +28105,11 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
EVT SrcType = Shuffle->getValueType(0);
// We expect a single-source shuffle
- if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ if (!Shuffle->getOperand(1)->isUndef())
return SDValue();
unsigned SrcSize = SrcType.getScalarSizeInBits();
+ unsigned NumElems = SrcType.getVectorNumElements();
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
@@ -25742,7 +28133,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// the source and dest type.
unsigned ZextRatio = ResSize / SrcSize;
bool IsZext = true;
- for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ for (unsigned i = 0; i != NumElems; ++i) {
if (i % ZextRatio) {
if (Shuffle->getMaskElt(i) > 0) {
// Expected undef
@@ -25765,8 +28156,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
// (instead of undef) where the k elements come from the zero vector.
SmallVector<int, 8> Mask;
- unsigned NumElems = SrcType.getVectorNumElements();
- for (unsigned i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
if (i % ZextRatio)
Mask.push_back(NumElems);
else
@@ -25781,7 +28171,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
unsigned FPOpcode = ISD::DELETED_NODE;
if (N->getOpcode() == ISD::AND)
FPOpcode = X86ISD::FAND;
@@ -25798,8 +28188,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- ((Subtarget->hasSSE1() && VT == MVT::i32) ||
- (Subtarget->hasSSE2() && VT == MVT::i64))) {
+ ((Subtarget.hasSSE1() && VT == MVT::i32) ||
+ (Subtarget.hasSSE2() && VT == MVT::i64))) {
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
@@ -25812,21 +28202,63 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+
+ // TODO: Use AssertSext to mark any nodes that have the property of producing
+ // all-ones or all-zeros. Then check for that node rather than particular
+ // opcodes.
+ if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ return SDValue();
+
+ // The existence of the PCMP node guarantees that we have the required SSE2 or
+ // AVX2 for a shift of this vector type, but there is no vector shift by
+ // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
+ // masked compare nodes, so they should not make it here.
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
+ unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
+ if (VT0 != VT1 || EltBitWidth == 8)
+ return SDValue();
+
+ assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+ return DAG.getBitcast(N->getValueType(0), Shift);
+}
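
A scalar model of this combine may help (editor's sketch; pcmpgtLane is an illustrative helper, not an LLVM or x86 API). Because a compare lane is all-ones or all-zeros, AND-ing with 1 keeps exactly the bit that a logical shift right by EltBits-1 produces:

#include <cassert>
#include <cstdint>

// One lane of X86ISD::PCMPGT: all-ones if A > B (signed), else zero.
static uint32_t pcmpgtLane(int32_t A, int32_t B) {
  return A > B ? 0xFFFFFFFFu : 0u;
}

int main() {
  for (int32_t A : {-2, 0, 5})
    for (int32_t B : {-3, 1}) {
      uint32_t Lane = pcmpgtLane(A, B);
      // (and Lane, 1) == (srl Lane, 31), the X86ISD::VSRLI form above.
      assert((Lane & 1u) == (Lane >> 31));
    }
  return 0;
}
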
+
+static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
+ if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
return Zext;
- if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+ return R;
+
+ if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ return ShiftRight;
+
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -25834,143 +28266,176 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
- if (VT == MVT::i32 || VT == MVT::i64) {
- // Check for BEXTR.
- if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
- (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
- ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (MaskNode && ShiftNode) {
- uint64_t Mask = MaskNode->getZExtValue();
- uint64_t Shift = ShiftNode->getZExtValue();
- if (isMask_64(Mask)) {
- uint64_t MaskSize = countPopulation(Mask);
- if (Shift + MaskSize <= VT.getSizeInBits())
- return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
- DAG.getConstant(Shift | (MaskSize << 8), DL,
- VT));
- }
- }
- } // BEXTR
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+ if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
return SDValue();
+ if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (MaskNode && ShiftNode) {
+ uint64_t Mask = MaskNode->getZExtValue();
+ uint64_t Shift = ShiftNode->getZExtValue();
+ if (isMask_64(Mask)) {
+ uint64_t MaskSize = countPopulation(Mask);
+ if (Shift + MaskSize <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Shift | (MaskSize << 8), DL,
+ VT));
+ }
}
+ return SDValue();
+}
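
For reference, the BEXTR control operand built above packs the start bit in bits [7:0] and the field length in bits [15:8]. A minimal scalar model (editor's sketch; bextr here is illustrative, not a real intrinsic):

#include <cassert>
#include <cstdint>

static uint64_t bextr(uint64_t Src, uint64_t Control) {
  unsigned Start = Control & 0xFF;      // low byte: start bit
  unsigned Len = (Control >> 8) & 0xFF; // next byte: field length
  if (Start >= 64)
    return 0;
  uint64_t Field = Src >> Start;
  return Len >= 64 ? Field : Field & ((1ULL << Len) - 1);
}

int main() {
  uint64_t X = 0x12345678ULL;
  // (X >> 8) & 0xFFF, i.e. Shift = 8, MaskSize = 12:
  assert(((X >> 8) & 0xFFF) == bextr(X, 8 | (12u << 8)));
  return 0;
}
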
- // Want to form ANDNP nodes:
- // 1) In the hopes of then easily combining them with OR and AND nodes
- // to form PBLEND/PSIGN.
- // 2) To match ANDN packed intrinsics
- if (VT != MVT::v2i64 && VT != MVT::v4i64)
+// Try to fold:
+// (or (and (m, y), (pandn m, x)))
+// into:
+// (vselect m, x, y)
+// As a special case, try to fold:
+// (or (and (m, (sub 0, x)), (pandn m, x)))
+// into:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ return SDValue();
+ assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
+
+ // Canonicalize pandn to RHS
+ if (N0.getOpcode() == X86ISD::ANDNP)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
- // Check LHS for vnot
- if (N0.getOpcode() == ISD::XOR &&
- //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
- CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+ SDValue Mask = N1.getOperand(0);
+ SDValue X = N1.getOperand(1);
+ SDValue Y;
+ if (N0.getOperand(0) == Mask)
+ Y = N0.getOperand(1);
+ if (N0.getOperand(1) == Mask)
+ Y = N0.getOperand(0);
- // Check RHS for vnot
- if (N1.getOpcode() == ISD::XOR &&
- //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
- CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+ // Check to see if the mask appeared in both the AND and ANDNP.
+ if (!Y.getNode())
+ return SDValue();
- return SDValue();
+ // Validate that X, Y, and Mask are bitcasts, and see through them.
+ Mask = peekThroughBitcasts(Mask);
+ X = peekThroughBitcasts(X);
+ Y = peekThroughBitcasts(Y);
+
+ EVT MaskVT = Mask.getValueType();
+
+ // Validate that the Mask operand is a vector sra node.
+ // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+ // there is no psrai.b
+ unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned SraAmt = ~0;
+ if (Mask.getOpcode() == ISD::SRA) {
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
+ if ((SraAmt + 1) != EltBits)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Try to match:
+ // (or (and (M, (sub 0, X)), (pandn M, X)))
+ // which is a special case of vselect:
+ // (vselect M, (sub 0, X), X)
+ // Per:
+ // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+ // We know that, if fNegate is 0 or 1:
+ // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+ //
+ // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+ // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+ // ( M ? -X : X) == ((X ^ M ) + (M & 1))
+ // This lets us transform our vselect to:
+ // (add (xor X, M), (and M, 1))
+ // And further to:
+ // (sub (xor X, M), M)
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+
+ if (V) {
+ assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+ // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ }
+ }
+
+ // PBLENDVB is only available on SSE 4.1.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ return DAG.getBitcast(VT, Mask);
}
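
The conditional-negate identity cited in the comment above can be verified exhaustively for a single lane (editor's sketch; M plays the role of one all-ones or all-zeros mask element):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -5, 42}) {
    for (int32_t M : {0, -1}) { // -1 models an all-ones mask lane
      int32_t Select = M ? -X : X;              // (vselect M, (sub 0, X), X)
      assert(Select == (int32_t)((X ^ M) - M)); // (sub (xor X, M), M)
    }
  }
  return 0;
}
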
-static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- // look for psign/blend
- if (VT == MVT::v2i64 || VT == MVT::v4i64) {
- if (!Subtarget->hasSSSE3() ||
- (VT == MVT::v4i64 && !Subtarget->hasInt256()))
- return SDValue();
-
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
- std::swap(N0, N1);
- // or (and (m, y), (pandn m, x))
- if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
- SDValue Mask = N1.getOperand(0);
- SDValue X = N1.getOperand(1);
- SDValue Y;
- if (N0.getOperand(0) == Mask)
- Y = N0.getOperand(1);
- if (N0.getOperand(1) == Mask)
- Y = N0.getOperand(0);
-
- // Check to see if the mask appeared in both the AND and ANDNP and
- if (!Y.getNode())
- return SDValue();
-
- // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
- // Look through mask bitcast.
- if (Mask.getOpcode() == ISD::BITCAST)
- Mask = Mask.getOperand(0);
- if (X.getOpcode() == ISD::BITCAST)
- X = X.getOperand(0);
- if (Y.getOpcode() == ISD::BITCAST)
- Y = Y.getOperand(0);
-
- EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
- unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI) {
- SDValue SraC = Mask.getOperand(1);
- SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
- }
- if ((SraAmt + 1) != EltBits)
- return SDValue();
-
- SDLoc DL(N);
-
- // Now we know we at least have a plendvb with the mask val. See if
- // we can form a psignb/w/d.
- // psign = x.type == y.type == mask.type && y = sub(0, x);
- if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
- ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
- X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
- assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
- "Unsupported VT for PSIGN");
- Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
- return DAG.getBitcast(VT, Mask);
- }
- // PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
-
- X = DAG.getBitcast(BlendVT, X);
- Y = DAG.getBitcast(BlendVT, Y);
- Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
- return DAG.getBitcast(VT, Mask);
- }
- }
-
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
@@ -25982,7 +28447,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
- if (!OptForSize && Subtarget->isSHLDSlow())
+ if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
@@ -26040,7 +28505,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
}
// Generate NEG and CMOV for integer abs.
-static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Since X86 does not have CMOV for 8-bit integer, we don't convert
@@ -26073,13 +28538,14 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Try to turn tests against the signbit in the form of:
-// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
-// into:
-// SETGT(X, -1)
+/// Try to turn tests against the signbit in the form of:
+/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+/// into:
+/// SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
- // This is only worth doing if the output type is i8.
- if (N->getValueType(0) != MVT::i8)
+ // This is only worth doing if the output type is i8 or i1.
+ EVT ResultType = N->getValueType(0);
+ if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -26114,22 +28580,78 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
- SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), ResultType);
+ SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ if (SetCCResultType != ResultType)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
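
A quick scalar check of this fold (editor's sketch): flipping the shifted-down sign bit is the same as asking whether X is greater than -1.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {INT32_MIN, -1, 0, 1, INT32_MAX}) {
    // XOR(TRUNCATE(SRL(X, 31)), 1) ...
    uint8_t Folded = (uint8_t)(((uint32_t)X >> 31) ^ 1u);
    // ... equals SETGT(X, -1).
    assert(Folded == (uint8_t)(X > -1));
  }
  return 0;
}
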
-static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+/// Turn vector tests of the signbit in the form of:
+/// xor (sra X, elt_size(X)-1), -1
+/// into:
+/// pcmpgt X, -1
+///
+/// This should be called before type legalization because the pattern may not
+/// persist after that.
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isSimple())
+ return SDValue();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
+ case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
+ }
+
+ // There must be a shift right algebraic before the xor, and the xor must be a
+ // 'not' operation.
+ SDValue Shift = N->getOperand(0);
+ SDValue Ones = N->getOperand(1);
+ if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
+ !ISD::isBuildVectorAllOnes(Ones.getNode()))
+ return SDValue();
+
+ // The shift should be smearing the sign bit across each vector element.
+ auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
+ if (!ShiftBV)
+ return SDValue();
+
+ EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+ auto *ShiftAmt = ShiftBV->getConstantSplatNode();
+ if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ return SDValue();
+
+ // Create a greater-than comparison against -1. We don't use the more obvious
+ // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
+ return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+}
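
One lane of the vector fold behaves the same way, except the result is a full mask rather than a bool (editor's sketch; the ternary stands in for the arithmetic shift to stay within standard C++):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {-7, -1, 0, 42}) {
    int32_t Sra = X < 0 ? -1 : 0;                  // sra X, 31
    uint32_t Folded = (uint32_t)Sra ^ 0xFFFFFFFFu; // xor with all-ones
    uint32_t Cmp = X > -1 ? 0xFFFFFFFFu : 0u;      // pcmpgt X, -1
    assert(Folded == Cmp);
  }
  return 0;
}
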
+
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
- if (Subtarget->hasCMov())
- if (SDValue RV = performIntegerAbsCombine(N, DAG))
+ if (Subtarget.hasCMov())
+ if (SDValue RV = combineIntegerAbs(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
@@ -26142,7 +28664,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, SDLoc DL) {
+ const X86Subtarget &Subtarget,
+ const SDLoc &DL) {
if (!VT.isVector() || !VT.isSimple())
return SDValue();
EVT InVT = In.getValueType();
@@ -26159,10 +28682,12 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
- if (Subtarget->hasAVX512()) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+ if (Subtarget.hasAVX512()) {
if (VT.getSizeInBits() > 512)
return SDValue();
- } else if (Subtarget->hasAVX2()) {
+ } else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
return SDValue();
} else {
@@ -26221,10 +28746,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
- SDValue One = DAG.getConstant(1, DL, InScalarVT);
- SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
- SmallVector<SDValue, 8>(NumElems, One));
- Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+ SDValue VecOnes = DAG.getConstant(1, DL, InVT);
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1]);
@@ -26258,10 +28781,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();
}
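
The semantics being matched are those of the unsigned rounded average, computed in a wider type so the +1 cannot overflow. A one-lane model (editor's sketch; avgu8 is illustrative):

#include <cassert>
#include <cstdint>

static uint8_t avgu8(uint8_t A, uint8_t B) {
  // c = (a + b + 1) / 2, evaluated in i16 like the matched zext pattern.
  return (uint8_t)(((uint16_t)A + (uint16_t)B + 1) >> 1);
}

int main() {
  assert(avgu8(0, 0) == 0);
  assert(avgu8(255, 255) == 255); // no wraparound thanks to widening
  assert(avgu8(1, 2) == 2);       // rounds up, matching X86ISD::AVG
  return 0;
}
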
-/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
-static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
@@ -26283,41 +28805,180 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment =
- DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems/2);
- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Alignment);
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- std::min(16U, Alignment));
+ SDValue Load1 =
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ Alignment, Ld->getMemOperand()->getFlags());
+
+ Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
+ SDValue Load2 =
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1),
Load2.getValue(1));
SDValue NewVec = DAG.getUNDEF(RegVT);
- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
return DCI.CombineTo(N, NewVec, TF, true);
}
return SDValue();
}
-/// PerformMLOADCombine - Resolve extending loads
-static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// If V is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(SDValue V) {
+ // This needs to be a build vector of booleans.
+ // TODO: Checking for the i1 type matches the IR definition for the mask,
+ // but the mask check could be loosened to i8 or other types. That might
+ // also require checking more than 'allOnesValue'; eg, the x86 HW
+ // instructions only require that the MSB is set for each mask element.
+ // The ISD::MSTORE comments/definition do not specify how the mask operand
+ // is formatted.
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+ return -1;
+
+ int TrueIndex = -1;
+ unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ const SDValue &Op = BV->getOperand(i);
+ if (Op.isUndef())
+ continue;
+ auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+ if (!ConstNode)
+ return -1;
+ if (ConstNode->getAPIntValue().isAllOnesValue()) {
+ // If we already found a one, this is too many.
+ if (TrueIndex >= 0)
+ return -1;
+ TrueIndex = i;
+ }
+ }
+ return TrueIndex;
+}
+
+/// Given a masked memory load/store operation, return true if it has one mask
+/// bit set. If it has one mask bit set, then also return the memory address of
+/// the scalar element to load/store, the vector index to insert/extract that
+/// scalar element, and the alignment for the scalar memory access.
+static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
+ SelectionDAG &DAG, SDValue &Addr,
+ SDValue &Index, unsigned &Alignment) {
+ int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
+ if (TrueMaskElt < 0)
+ return false;
+
+ // Get the address of the one scalar element that is specified by the mask
+ // using the appropriate offset from the base pointer.
+ EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+ Addr = MaskedOp->getBasePtr();
+ if (TrueMaskElt != 0) {
+ unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
+ Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
+ }
+
+ Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
+ Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
+ return true;
+}
+
+/// If exactly one element of the mask is set for a non-extending masked load,
+/// it is a scalar load and vector insert.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue
+reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ unsigned Alignment;
+ if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
+ return SDValue();
+
+ // Load the one scalar element that is specified by the mask using the
+ // appropriate offset from the base pointer.
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Load =
+ DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
+ Alignment, ML->getMemOperand()->getFlags());
+
+ // Insert the loaded element into the appropriate place in the vector.
+ SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
+ Load, VecIndex);
+ return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
+}
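
A small model of the single-true-element rewrite (editor's sketch in plain C++; arrays stand in for vectors and memory):

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Mem = {10, 20, 30, 40};
  std::array<bool, 4> Mask = {false, false, true, false}; // one true elt
  std::array<int, 4> Src0 = {-1, -1, -1, -1};             // pass-through

  // Masked-load semantics: live lanes read memory, dead lanes keep Src0.
  std::array<int, 4> Ref = Src0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i])
      Ref[i] = Mem[i];

  // Rewritten form: one scalar load at the true index, then an insert.
  int K = 2; // what getOneTrueElt() would return
  std::array<int, 4> Opt = Src0;
  Opt[K] = *(Mem.data() + K); // load from BasePtr + K * element size
  assert(Opt == Ref);
  return 0;
}
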
+
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
+ return SDValue();
+
+ SDLoc DL(ML);
+ EVT VT = ML->getValueType(0);
+
+ // If we are loading the first and last elements of a vector, it is safe and
+ // always faster to load the whole vector. Replace the masked load with a
+ // vector load and select.
+ unsigned NumElts = VT.getVectorNumElements();
+ BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+ bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
+ bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
+ if (LoadFirstElt && LoadLastElt) {
+ SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+ ML->getMemOperand());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
+ return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
+ }
+
+ // Convert a masked load with a constant mask into a masked load and a select.
+ // This allows the select operation to use a faster kind of select instruction
+ // (for example, vblendvps -> vblendps).
+
+ // Don't try this if the pass-through operand is already undefined. That would
+ // cause an infinite loop because that's what we're about to create.
+ if (ML->getSrc0().isUndef())
+ return SDValue();
+
+ // The new masked load has an undef pass-through operand. The select uses the
+ // original pass-through operand.
+ SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+ ML->getMask(), DAG.getUNDEF(VT),
+ ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getExtensionType());
+ SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
+
+ return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
+}
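
The first/last-element condition above is what makes the full-width load safe: if both end lanes are dereferenceable, everything in between is too. A sketch of the resulting load + select (editor's model, not LLVM code):

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Mem = {1, 2, 3, 4};
  std::array<bool, 4> Mask = {true, false, true, true}; // ends are set
  std::array<int, 4> Src0 = {9, 9, 9, 9};

  std::array<int, 4> VecLd = Mem; // one plain vector load
  std::array<int, 4> Blend{};
  for (int i = 0; i < 4; ++i)
    Blend[i] = Mask[i] ? VecLd[i] : Src0[i]; // select(Mask, VecLd, Src0)

  assert((Blend == std::array<int, 4>{1, 9, 3, 4}));
  return 0;
}
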
+
+static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
+ if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+ return ScalarLoad;
+ // TODO: Do some AVX512 subsets benefit from this transform?
+ if (!Subtarget.hasAVX512())
+ if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
+ return Blend;
+ }
+
if (Mld->getExtensionType() != ISD::SEXTLOAD)
return SDValue();
+ // Resolve extending loads.
EVT VT = Mld->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
@@ -26326,21 +28987,21 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getVectorElementType().getSizeInBits();
unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
- // From, To sizes and ElemCount must be pow of two
+ // From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
- // Create a type on which we perform the shuffle
+ // Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- // Convert Src0 value
+ // Convert Src0 value.
SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
- if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ if (!Mld->getSrc0().isUndef()) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
@@ -26349,13 +29010,13 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
- DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
}
- // Prepare the new mask
+ // Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
- // Mask and original value have the same type
+ // Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
@@ -26364,9 +29025,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
- &ShuffleVec[0]);
- }
- else {
+ ShuffleVec);
+ } else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
@@ -26390,13 +29050,41 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
-/// PerformMSTORECombine - Resolve truncating stores
-static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+
+/// If exactly one element of the mask is set for a non-truncating masked store,
+/// it is a vector extract and scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+ SelectionDAG &DAG) {
+ // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+ // However, some target hooks may need to be added to know when the transform
+ // is profitable. Endianness would also have to be considered.
+
+ SDValue Addr, VecIndex;
+ unsigned Alignment;
+ if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
+ return SDValue();
+
+ // Extract the one scalar element that is actually being stored.
+ SDLoc DL(MS);
+ EVT VT = MS->getValue().getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+ MS->getValue(), VecIndex);
+
+ // Store that element at the appropriate offset from the base pointer.
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
+ Alignment, MS->getMemOperand()->getFlags());
+}
+
+static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (!Mst->isTruncatingStore())
- return SDValue();
+ return reduceMaskedStoreToScalarStore(Mst, DAG);
+ // Resolve truncating stores.
EVT VT = Mst->getValue().getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
@@ -26415,7 +29103,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
if (TLI.isTruncStoreLegal(VT, StVT))
return SDValue();
- // From, To sizes and ElemCount must be pow of two
+ // From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
@@ -26426,7 +29114,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
- // Create a type on which we perform the shuffle
+ // Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
@@ -26443,12 +29131,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
- &ShuffleVec[0]);
+ ShuffleVec);
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
- // Mask and original value have the same type
+ // Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
@@ -26456,9 +29144,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
- &ShuffleVec[0]);
- }
- else {
+ ShuffleVec);
+ } else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
@@ -26479,9 +29166,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
Mst->getBasePtr(), NewMask, StVT,
Mst->getMemOperand(), false);
}
-/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
-static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+
+static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
@@ -26496,26 +29183,24 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AddressSpace, Alignment, &Fast) && !Fast) {
+ AddressSpace, Alignment, &Fast) &&
+ !Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
- SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+ SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
+ SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
- SDValue Stride =
- DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
-
- SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), Alignment);
- SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(),
- std::min(16U, Alignment));
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+
+ SDValue Ch0 =
+ DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
+ Alignment, St->getMemOperand()->getFlags());
+ SDValue Ch1 =
+ DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
+ std::min(16U, Alignment), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
@@ -26526,12 +29211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
- SDValue Avg =
- detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
- if (Avg.getNode())
+ if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+ Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
@@ -26543,7 +29227,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
// In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegal(VT, StVT))
+ if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
return SDValue();
// From, To sizes and ElemCount must be pow of two
@@ -26573,7 +29257,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
- &ShuffleVec[0]);
+ ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -26595,8 +29279,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
- TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
@@ -26604,10 +29286,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i, dl));
- SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Ch =
+ DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
Chains.push_back(Ch);
}
@@ -26626,9 +29308,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
- !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT.isVector() ||
- (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
@@ -26667,58 +29349,49 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
- if (Subtarget->is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ if (Subtarget.is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
SDValue NewChain = NewLd.getValue(1);
- if (TokenFactorIndex != -1) {
+ if (TokenFactorIndex >= 0) {
Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
- St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(),
- St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
// Otherwise, lower to two pairs of 32-bit loads / stores.
SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
- DAG.getConstant(4, LdDL, MVT::i32));
+ SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
- Ld->getPointerInfo(),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(), Ld->getAlignment());
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
- Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->isInvariant(),
- MinAlign(Ld->getAlignment(), 4));
+ MinAlign(Ld->getAlignment(), 4),
+ Ld->getMemOperand()->getFlags());
SDValue NewChain = LoLd.getValue(1);
- if (TokenFactorIndex != -1) {
+ if (TokenFactorIndex >= 0) {
Ops.push_back(LoLd);
Ops.push_back(HiLd);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
- HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
- DAG.getConstant(4, StDL, MVT::i32));
-
- SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
- St->getPointerInfo(),
- St->isVolatile(), St->isNonTemporal(),
- St->getAlignment());
- SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
- St->getPointerInfo().getWithOffset(4),
- St->isVolatile(),
- St->isNonTemporal(),
- MinAlign(St->getAlignment(), 4));
+ HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
+
+ SDValue LoSt =
+ DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ SDValue HiSt = DAG.getStore(
+ NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
+ MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
@@ -26728,7 +29401,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
- if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
@@ -26738,8 +29411,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
- St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
}
return SDValue();
@@ -26798,14 +29471,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
SDValue A, B;
SmallVector<int, 16> LMask(NumElts);
if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!LHS.getOperand(0).isUndef())
A = LHS.getOperand(0);
- if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ if (!LHS.getOperand(1).isUndef())
B = LHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), LMask.begin());
} else {
- if (LHS.getOpcode() != ISD::UNDEF)
+ if (!LHS.isUndef())
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask[i] = i;
@@ -26816,14 +29489,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
SDValue C, D;
SmallVector<int, 16> RMask(NumElts);
if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ if (!RHS.getOperand(0).isUndef())
C = RHS.getOperand(0);
- if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ if (!RHS.getOperand(1).isUndef())
D = RHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), RMask.begin());
} else {
- if (RHS.getOpcode() != ISD::UNDEF)
+ if (!RHS.isUndef())
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask[i] = i;
@@ -26871,33 +29544,22 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
return true;
}
-/// Do target-specific dag combines on floating point adds.
-static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- // Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, true))
- return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
- return SDValue();
-}
-
-/// Do target-specific dag combines on floating point subs.
-static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+/// Do target-specific dag combines on floating-point adds/subs.
+static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ bool IsFadd = N->getOpcode() == ISD::FADD;
+ assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
- // Try to synthesize horizontal subs from subs of shuffles.
- if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, false))
- return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
+ // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+ if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ isHorizontalBinOp(LHS, RHS, IsFadd)) {
+ auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+ }
return SDValue();
}
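
For reference, the lane layout that isHorizontalBinOp is matching, shown for v4f32 (editor's sketch; hadd pairs adjacent elements, first from A, then from B):

#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4};
  std::array<float, 4> B = {10, 20, 30, 40};
  // X86ISD::FHADD lane model: { A0+A1, A2+A3, B0+B1, B2+B3 }.
  std::array<float, 4> H = {A[0] + A[1], A[2] + A[3],
                            B[0] + B[1], B[2] + B[3]};
  assert(H[0] == 3 && H[1] == 7 && H[2] == 30 && H[3] == 70);
  return 0;
}
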
@@ -26916,13 +29578,11 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
// First, use mask to unset all bits that won't appear in the result.
assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
"OutSVT can only be either i8 or i16.");
- SDValue MaskVal =
- DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
- SDValue MaskVec = DAG.getNode(
- ISD::BUILD_VECTOR, DL, InVT,
- SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+ APInt Mask =
+ APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
+ SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
for (auto &Reg : Regs)
- Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+ Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
MVT UnpackedVT, PackedVT;
if (OutSVT == MVT::i8) {
@@ -26938,7 +29598,7 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
j < e; j *= 2, RegNum /= 2) {
for (unsigned i = 0; i < RegNum; i++)
- Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+ Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
for (unsigned i = 0; i < RegNum / 2; i++)
Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
Regs[i * 2 + 1]);
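
The masking step above is what lets the unsigned-saturating pack act as a plain truncation: once the high bits are cleared, no lane can saturate. A one-lane model (editor's sketch; packusLane is illustrative):

#include <cassert>
#include <cstdint>

static uint8_t packusLane(uint16_t V) {
  return V > 0xFF ? 0xFF : (uint8_t)V; // PACKUSWB saturation rule
}

int main() {
  uint16_t Wide = 0xABCD;
  uint16_t Masked = Wide & 0x00FF; // APInt::getLowBitsSet(16, 8)
  assert(packusLane(Masked) == (uint8_t)Wide); // 0xCD, no saturation
  return 0;
}
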
@@ -26990,7 +29650,7 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
@@ -27005,7 +29665,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
- if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
@@ -27016,7 +29676,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
  // SSSE3's pshufb results in fewer instructions in the cases below.
- if (Subtarget->hasSSSE3() && NumElems == 8 &&
+ if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
@@ -27026,20 +29686,17 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
// Split a long vector into vectors of legal type.
unsigned RegNum = InVT.getSizeInBits() / 128;
SmallVector<SDValue, 8> SubVec(RegNum);
- if (InSVT == MVT::i32) {
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(i * 4, DL));
- } else {
- for (unsigned i = 0; i < RegNum; i++)
- SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(i * 2, DL));
- }
+ unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
+ EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
- // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+ DAG.getIntPtrConstant(i * NumSubRegElts, DL));
+
+ // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
- if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+ if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
else if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
@@ -27047,20 +29704,30 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
+
// Try to detect AVG pattern first.
- SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
- Subtarget, SDLoc(N));
- if (Avg.getNode())
+ if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // Detect bitcasts from x86mmx to i32: the bitcast source is a direct
+ // MMX result.
+ if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
+ }
+
return combineVectorTruncation(N, DAG, Subtarget);
}
/// Do target-specific dag combines on floating point negations.
-static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
SDValue Arg = N->getOperand(0);
@@ -27074,7 +29741,7 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
- Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+ Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
@@ -27102,17 +29769,17 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
- if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+ if (VT.is512BitVector() && !Subtarget.hasDQI()) {
    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
// These logic operations may be executed in the integer domain.
SDLoc dl(N);
MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
- SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+ SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+ SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
@@ -27122,13 +29789,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
- return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+ return DAG.getBitcast(VT, IntOp);
}
return SDValue();
}
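
lowerX86FPLogicOp's bitcast trick is sound because these ops only manipulate bit patterns, so the same logic can run in either domain. A scalar demonstration using memcpy-based bitcasts (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// XOR two f32 values in the integer domain, as the combine does for vectors.
static float fxor(float a, float b) {
  uint32_t ia, ib;
  std::memcpy(&ia, &a, sizeof ia);  // bitcast f32 -> i32
  std::memcpy(&ib, &b, sizeof ib);
  uint32_t ir = ia ^ ib;
  float r;
  std::memcpy(&r, &ir, sizeof r);   // bitcast i32 -> f32
  return r;
}

int main() {
  // XOR with the sign-bit mask negates: the classic XORPS-based FNEG.
  uint32_t bit = 0x80000000u;
  float signmask;
  std::memcpy(&signmask, &bit, sizeof signmask);
  assert(fxor(2.5f, signmask) == -2.5f);
  return 0;
}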
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
-static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
@@ -27145,7 +29812,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
-static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// Only perform optimizations if UnsafeMath is used.
@@ -27165,9 +29832,9 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(0), N->getOperand(1));
}
-static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- if (Subtarget->useSoftFloat())
+static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat())
return SDValue();
// TODO: Check for global or instruction-level "nnan". In that case, we
@@ -27176,9 +29843,9 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
// should be an optional swap and FMAX/FMIN.
EVT VT = N->getValueType(0);
- if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
- (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+ (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
// This takes at least 3 instructions, so favor a library call when operating
@@ -27222,8 +29889,8 @@ static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -27238,8 +29905,8 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
}
/// Do target-specific dag combines on X86ISD::FANDN nodes
-static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -27253,9 +29920,8 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
-static SDValue PerformBTCombine(SDNode *N,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
SDValue Op1 = N->getOperand(1);
if (Op1.hasOneUse()) {
@@ -27272,21 +29938,19 @@ static SDValue PerformBTCombine(SDNode *N,
return SDValue();
}
-static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
- SDValue Op = N->getOperand(0);
- if (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
+static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = peekThroughBitcasts(N->getOperand(0));
EVT VT = N->getValueType(0), OpVT = Op.getValueType();
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
- return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+ return DAG.getBitcast(VT, Op);
}
return SDValue();
}
-static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
@@ -27307,7 +29971,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
// EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
- if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+ if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
@@ -27325,7 +29989,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
/// to combine math ops, use an LEA, or use a complex addressing mode. This can
/// eliminate extend, add, and shift instructions.
static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ const X86Subtarget &Subtarget) {
// TODO: This should be valid for other integer types.
EVT VT = Sext->getValueType(0);
if (VT != MVT::i64)
@@ -27397,14 +30061,106 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
return R.getValue(1);
}
-static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
+/// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
+/// concatenating it with UNDEFs) into vectors of the same size as the target
+/// type; the *_EXTEND_VECTOR_INREG node then extends the lowest elements.
+static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InVT = N0.getValueType();
EVT InSVT = InVT.getScalarType();
+
+ // Input type must be a vector and we must be extending legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ // On AVX2+ targets, if the input/output types are both legal then we will be
+ // able to use SIGN_EXTEND/ZERO_EXTEND directly.
+ if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
+ EVT InVT = N.getValueType();
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+ Size / InVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
+ DAG.getUNDEF(InVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ };
+
+ // If target-size is less than 128-bits, extend to a type that would extend
+ // to 128 bits, extend that and extract the original target vector.
+ if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
+ unsigned Scale = 128 / VT.getSizeInBits();
+ EVT ExVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+ SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+ SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
+ // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
+  // Also use this if we don't have SSE41, to let the legalizer do its job.
+ if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget.hasInt256())) {
+ SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
+ return Opcode == ISD::SIGN_EXTEND
+ ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
+ : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+ }
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendVecSize(DL, SrcVec, 128);
+ SrcVec = Opcode == ISD::SIGN_EXTEND
+ ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
+ : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ }
+
+ return SDValue();
+}
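
For readers unfamiliar with the node: *_EXTEND_VECTOR_INREG consumes a full-width vector but extends only its lowest elements. A scalar model of sign-extending the low four lanes of a v16i8 to v4i32 (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // v16i8 input; SIGN_EXTEND_VECTOR_INREG to v4i32 reads only lanes 0..3.
  int8_t in[16] = {-1, 2, -128, 127};  // lanes 4..15 are ignored
  int32_t out[4];
  for (int i = 0; i < 4; ++i)
    out[i] = in[i];  // implicit sign extension from i8 to i32
  assert(out[0] == -1 && out[2] == -128 && out[3] == 127);
  return 0;
}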
+
+static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = N0.getValueType();
SDLoc DL(N);
if (SDValue DivRem8 = getDivRem8(N, DAG))
@@ -27414,70 +30170,16 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
- if (VT.isVector() && Subtarget->hasSSE2()) {
- auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
- EVT InVT = N.getValueType();
- EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
- Size / InVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
- DAG.getUNDEF(InVT));
- Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
- };
-
- // If target-size is less than 128-bits, extend to a type that would extend
- // to 128 bits, extend that and extract the original target vector.
- if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- unsigned Scale = 128 / VT.getSizeInBits();
- EVT ExVT =
- EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
- SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
- SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
- // which ensures lowering to X86ISD::VSEXT (pmovsx*).
- if (VT.getSizeInBits() == 128 &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- SDValue ExOp = ExtendVecSize(DL, N0, 128);
- return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
- }
-
- // On pre-AVX2 targets, split into 128-bit nodes of
- // ISD::SIGN_EXTEND_VECTOR_INREG.
- if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
- (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
- (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
- unsigned NumVecs = VT.getSizeInBits() / 128;
- unsigned NumSubElts = 128 / SVT.getSizeInBits();
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
- EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
-
- SmallVector<SDValue, 8> Opnds;
- for (unsigned i = 0, Offset = 0; i != NumVecs;
- ++i, Offset += NumSubElts) {
- SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
- DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, 128);
- SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
- Opnds.push_back(SrcVec);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- }
- }
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
- if (Subtarget->hasAVX() && VT.is256BitVector())
+ if (Subtarget.hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -27487,8 +30189,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -27497,7 +30199,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
@@ -27526,9 +30228,9 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, dl, VT, A, B, C);
}
-static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
@@ -27563,6 +30265,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -27573,10 +30278,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Optimize x == -y --> x+y == 0
-// x != -y --> x+y != 0
-static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget* Subtarget) {
+/// Optimize x == -y --> x+y == 0
+/// x != -y --> x+y != 0
+static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -27631,10 +30336,15 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
+ // via legalization because v4i32 is not a legal type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
+ return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
return SDValue();
}
-static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
// Gather and Scatter instructions use k-registers for masks. The type of
// the masks is v*i1. So the mask will be truncated anyway.
@@ -27648,11 +30358,11 @@ static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// Helper function of performSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
- MVT VT) {
+static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
+ SelectionDAG &DAG, MVT VT) {
if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
@@ -27667,9 +30377,9 @@ static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
}
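
The comment above refers to the trick that "sbb r, r" computes r - r - CF, i.e. -CF, yielding 0 or all-ones without a zext. A scalar model (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// sbb reg,reg produces 0x00000000 or 0xFFFFFFFF depending on the carry flag.
static uint32_t sbb_same_reg(bool carry) {
  return 0u - static_cast<uint32_t>(carry);  // well-defined unsigned wrap
}

int main() {
  assert(sbb_same_reg(false) == 0x00000000u);
  assert(sbb_same_reg(true) == 0xFFFFFFFFu);  // all-ones mask, no zext needed
  return 0;
}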
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -27698,7 +30408,8 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
if (CC == X86::COND_B)
return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
- if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ // Try to simplify the EFLAGS and condition code operands.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
@@ -27706,28 +30417,28 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Optimize branch condition evaluation.
-//
-static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// Optimize branch condition evaluation.
+static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Dest = N->getOperand(1);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
- if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+ // Try to simplify the EFLAGS and condition code operands.
+ // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+ // RAUW them under us.
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
- return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
- Flags);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+ N->getOperand(1), Cond, Flags);
}
return SDValue();
}
-static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
- SelectionDAG &DAG) {
+static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
+ SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
@@ -27772,8 +30483,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}
-static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -27797,11 +30508,11 @@ static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
- if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
+ if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
@@ -27822,18 +30533,18 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
- if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
+ if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
- // This transformation is not supported if the result type is f16
- if (VT == MVT::f16)
+ // This transformation is not supported if the result type is f16 or f128.
+ if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !Subtarget->is64Bit() && LdVT == MVT::i64) {
- SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ !Subtarget.is64Bit() && LdVT == MVT::i64) {
+ SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
@@ -27843,8 +30554,8 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
-static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
- X86TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
+ X86TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
@@ -27868,10 +30579,10 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// fold (add Y, (sete X, 0)) -> adc 0, Y
-// (add Y, (setne X, 0)) -> sbb -1, Y
-// (sub (sete X, 0), Y) -> sbb 0, Y
-// (sub (setne X, 0), Y) -> adc -1, Y
+/// fold (add Y, (sete X, 0)) -> adc 0, Y
+/// (add Y, (setne X, 0)) -> sbb -1, Y
+/// (sub (sete X, 0), Y) -> sbb 0, Y
+/// (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
@@ -27909,24 +30620,163 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}
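
The adc/sbb folds documented above work because "cmp x, 1" sets the carry flag exactly when x is 0 (unsigned borrow), so "adc $0, y" yields y + (x == 0). A scalar check (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t xs[] = {0u, 1u, 7u};
  for (uint32_t x : xs) {
    uint32_t y = 100;
    bool cf = x < 1;                        // carry out of "cmp x, 1"
    uint32_t adc = y + (cf ? 1u : 0u);      // "adc $0, y"
    assert(adc == y + (x == 0 ? 1u : 0u));  // (add y, (sete x, 0))
  }
  return 0;
}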
-/// PerformADDCombine - Do target-specific dag combines on integer adds.
-static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+  if (!VT.isVector() || !VT.isSimple() ||
+      VT.getVectorElementType() != MVT::i32)
+ return SDValue();
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+
+  // We handle at most v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ if (VT.getSizeInBits() / 4 > RegSize)
+ return SDValue();
+
+ // Detect the following pattern:
+ //
+ // 1: %2 = zext <N x i8> %0 to <N x i32>
+ // 2: %3 = zext <N x i8> %1 to <N x i32>
+ // 3: %4 = sub nsw <N x i32> %2, %3
+ // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+ // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
+ // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+ // 7: %8 = add nsw <N x i32> %7, %vec.phi
+ //
+  // The last instruction must be a reduction add. Instructions 3-6 form an
+  // ABSDIFF pattern.
+
+ // The two operands of reduction add are from PHI and a select-op as in line 7
+ // above.
+ SDValue SelectOp, Phi;
+ if (Op0.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op0;
+ Phi = Op1;
+ } else if (Op1.getOpcode() == ISD::VSELECT) {
+ SelectOp = Op1;
+ Phi = Op0;
+ } else
+ return SDValue();
+
+ // Check the condition of the select instruction is greater-than.
+ SDValue SetCC = SelectOp->getOperand(0);
+ if (SetCC.getOpcode() != ISD::SETCC)
+ return SDValue();
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ if (CC != ISD::SETGT)
+ return SDValue();
+
+ Op0 = SelectOp->getOperand(1);
+ Op1 = SelectOp->getOperand(2);
+
+ // The second operand of SelectOp Op1 is the negation of the first operand
+ // Op0, which is implemented as 0 - Op0.
+ if (!(Op1.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
+ Op1.getOperand(1) == Op0))
+ return SDValue();
+
+ // The first operand of SetCC is the first operand of SelectOp, which is the
+ // difference between two input vectors.
+ if (SetCC.getOperand(0) != Op0)
+ return SDValue();
+
+ // The second operand of > comparison can be either -1 or 0.
+ if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+ return SDValue();
+
+ // The first operand of SelectOp is the difference between two input vectors.
+ if (Op0.getOpcode() != ISD::SUB)
+ return SDValue();
+
+ Op1 = Op0.getOperand(1);
+ Op0 = Op0.getOperand(0);
+
+ // Check if the operands of the diff are zero-extended from vectors of i8.
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+ Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+ Op1.getOpcode() != ISD::ZERO_EXTEND ||
+ Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+ return SDValue();
+
+ // SAD pattern detected. Now build a SAD instruction and an addition for
+  // reduction. Note that the number of elements of the result of SAD is less
+  // than the number of elements of its input. Therefore, we can only update
+  // part of the elements in the reduction vector.
+
+ // Legalize the type of the inputs of PSADBW.
+ EVT InVT = Op0.getOperand(0).getValueType();
+ if (InVT.getSizeInBits() <= 128)
+ RegSize = 128;
+ else if (InVT.getSizeInBits() <= 256)
+ RegSize = 256;
+
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
+ Ops[0] = Op0.getOperand(0);
+ MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+ Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ Ops[0] = Op1.getOperand(0);
+ Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+ // The output of PSADBW is a vector of i64.
+ MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+ SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
+
+ // We need to turn the vector of i64 into a vector of i32.
+ // If the reduction vector is at least as wide as the psadbw result, just
+ // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
+ // anyway.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ if (VT.getSizeInBits() >= ResVT.getSizeInBits())
+ Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+ else
+ Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
+
+ if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+    // Update part of the elements of the reduction vector. This is done by
+    // first extracting a sub-vector from it, updating this sub-vector, and
+    // inserting it back.
+ SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
+ DAG.getIntPtrConstant(0, DL));
+ } else
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+}
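
For reference, each 64-bit lane of the X86ISD::PSADBW node built above holds the sum of absolute differences of the corresponding eight unsigned bytes of its inputs. A scalar model of one lane (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// One 64-bit lane of PSADBW: sum of |a[i] - b[i]| over 8 unsigned bytes.
static uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += static_cast<uint64_t>(std::abs(int(a[i]) - int(b[i])));
  return Sum;
}

int main() {
  uint8_t A[8] = {0, 10, 255, 3, 0, 0, 0, 0};
  uint8_t B[8] = {1, 5, 0, 3, 0, 0, 0, 0};
  assert(psadbw_lane(A, B) == 1 + 5 + 255);
  return 0;
}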
+
+static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ if (Flags->hasVectorReduction()) {
+ if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
+ return Sad;
+ }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -27950,30 +30800,44 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
EVT VT = N->getValueType(0);
- if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
-/// performVZEXTCombine - Performs build vector combines
-static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N->getSimpleValueType(0);
+ MVT SVT = VT.getVectorElementType();
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
- // (vzext (bitcast (vzext (x)) -> (vzext x)
- SDValue V = Op;
- while (V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
+ // Perform any constant folding.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ SmallVector<SDValue, 4> Vals;
+ for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ SDValue OpElt = Op.getOperand(i);
+ if (OpElt.getOpcode() == ISD::UNDEF) {
+ Vals.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
+ assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
+ Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
+ Vals.push_back(DAG.getConstant(Cst, DL, SVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
+ }
+  // (vzext (bitcast (vzext x))) -> (vzext x)
+ SDValue V = peekThroughBitcasts(Op);
if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
MVT InnerVT = V.getSimpleValueType();
MVT InnerEltVT = InnerVT.getVectorElementType();
@@ -28022,61 +30886,111 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
+static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ MVT VT = RHS.getSimpleValueType();
+ SDLoc DL(N);
+
+ auto *C = dyn_cast<ConstantSDNode>(RHS);
+ if (!C || C->getZExtValue() != 1)
+ return SDValue();
+
+ RHS = DAG.getConstant(-1, DL, VT);
+ MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+ return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, LHS, RHS}, VT, MMO);
+}
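
The canonicalization above is sound because subtracting 1 and adding -1 are the same two's-complement operation on the memory operand. An std::atomic analogue (illustrative, not part of the patch):

#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> x(5), y(5);
  x.fetch_sub(1);   // the "lock sub $1, (mem)" shape
  y.fetch_add(-1);  // the canonical "lock add $-1, (mem)" shape
  assert(x.load() == 4 && y.load() == 4);
  return 0;
}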
+
+// TEST (AND a, b) ,(AND a, b) -> TEST a, b
+static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ return DAG.getNode(X86ISD::TESTM, DL, VT,
+ Op0->getOperand(0), Op0->getOperand(1));
+}
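
This fold relies on AND being idempotent: (a & b) & (a & b) == a & b, so testing the ANDed value against itself sets the same per-lane mask as testing a against b. A scalar check (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t a = 0xC, b = 0xA;
  // TESTM sets a mask bit where (op0 & op1) != 0 in a lane; with both
  // operands equal to (a & b), the tested value is just a & b.
  assert(((a & b) & (a & b)) == (a & b));
  return 0;
}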
+
+static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ SDLoc DL(N);
+
+ if (N->getOperand(0) == N->getOperand(1)) {
+ if (N->getOpcode() == X86ISD::PCMPEQ)
+ return getOnesVector(VT, Subtarget, DAG, DL);
+ if (N->getOpcode() == X86ISD::PCMPGT)
+ return getZeroVector(VT, Subtarget, DAG, DL);
+ }
+
+ return SDValue();
+}
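
combineVectorCompare folds self-comparisons: x == x is true in every lane and signed x > x is false in every lane, producing all-ones and all-zeros vectors respectively. A scalar model of the lane values (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  int32_t x = 42;
  int32_t eq = (x == x) ? -1 : 0;  // pcmpeqd lane: all-ones when equal
  int32_t gt = (x > x) ? -1 : 0;   // pcmpgtd lane: never true for x > x
  assert(eq == -1 && gt == 0);
  return 0;
}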
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
- case ISD::EXTRACT_VECTOR_ELT:
- return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+ case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT:
- case X86ISD::SHRUNKBLEND:
- return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
- case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
- case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
- case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
- case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
- case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
+ case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
+ case ISD::ADD: return combineAdd(N, DAG, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADC: return combineADC(N, DAG, DCI);
+ case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL:
case ISD::SRA:
- case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
- case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
- case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
- case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
- case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
- case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
- case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
- case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
- case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
- case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
- case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
+ case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
+ case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::FADD:
+ case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
+ case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
- case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
+ case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
- case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG,
- Subtarget);
- case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
- case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
- case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
- case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+ case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
+ case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
+ case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
+ case X86ISD::BT: return combineBT(N, DAG, DCI);
+ case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG);
case ISD::ANY_EXTEND:
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND_INREG:
- return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
- case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
- case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
- case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
+ case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
@@ -28086,23 +31000,36 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
- case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
- case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
+ case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
- case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG);
+ case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
+ case X86ISD::TESTM: return combineTestM(N, DAG);
+ case X86ISD::PCMPEQ:
+ case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
}
return SDValue();
}
-/// isTypeDesirableForOp - Return true if the target has native support for
-/// the specified value type and it is 'desirable' to use the type for the
-/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
-/// instruction encodings are longer and some i16 instructions are slow.
+/// Return true if the target has native support for the specified value type
+/// and it is 'desirable' to use the type for the given node type. e.g. On x86
+/// i16 is legal, but undesirable since i16 instruction encodings are longer and
+/// some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
@@ -28140,9 +31067,9 @@ bool X86TargetLowering::hasCopyImplyingStackAdjustment(
[](const MachineInstr &RI) { return RI.isCopy(); });
}
-/// IsDesirableToPromoteOp - This method query the target whether it is
-/// beneficial for dag combiner to promote the specified node. If true, it
-/// should return the desired promotion type by reference.
+/// This method queries the target whether it is beneficial for the dag
+/// combiner to promote the specified node. If true, it should return the
+/// desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
if (VT != MVT::i16)
@@ -28152,23 +31079,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
bool Commute = false;
switch (Op.getOpcode()) {
default: break;
- case ISD::LOAD: {
- LoadSDNode *LD = cast<LoadSDNode>(Op);
- // If the non-extending load has a single use and it's not live out, then it
- // might be folded.
- if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
- Op.hasOneUse()*/) {
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI) {
- // The only case where we'd want to promote LOAD (rather then it being
- // promoted as an operand is when it's only use is liveout.
- if (UI->getOpcode() != ISD::CopyToReg)
- return false;
- }
- }
- Promote = true;
- break;
- }
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
@@ -28250,7 +31160,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
- std::string AsmStr = IA->getAsmString();
+ const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
@@ -28323,8 +31233,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
+/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
@@ -28403,13 +31312,13 @@ TargetLowering::ConstraintWeight
weight = CW_SpecificReg;
break;
case 'y':
- if (type->isX86_MMXTy() && Subtarget->hasMMX())
+ if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'x':
case 'Y':
- if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
- ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
weight = CW_Register;
break;
case 'I':
@@ -28471,25 +31380,25 @@ TargetLowering::ConstraintWeight
return weight;
}
-/// LowerXConstraint - try to replace an X constraint, which matches anything,
-/// with another that has more specific requirements based on the type of the
-/// corresponding operand.
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget->hasSSE2())
+ if (Subtarget.hasSSE2())
return "Y";
- if (Subtarget->hasSSE1())
+ if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
-/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
-/// vector. If it is invalid, don't add anything to Ops.
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
@@ -28532,7 +31441,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
- (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
@@ -28605,7 +31514,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
- if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+ if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
@@ -28639,8 +31548,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
- if (isGlobalStubReference(
- Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
+ if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
return;
Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
@@ -28656,6 +31564,65 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
+/// Check if \p RC is a general purpose register class.
+/// I.e., GR* or one of their variant.
+static bool isGRClass(const TargetRegisterClass &RC) {
+ switch (RC.getID()) {
+ case X86::GR8RegClassID:
+ case X86::GR8_ABCD_LRegClassID:
+ case X86::GR8_ABCD_HRegClassID:
+ case X86::GR8_NOREXRegClassID:
+ case X86::GR16RegClassID:
+ case X86::GR16_ABCDRegClassID:
+ case X86::GR16_NOREXRegClassID:
+ case X86::GR32RegClassID:
+ case X86::GR32_ABCDRegClassID:
+ case X86::GR32_TCRegClassID:
+ case X86::GR32_NOREXRegClassID:
+ case X86::GR32_NOAXRegClassID:
+ case X86::GR32_NOSPRegClassID:
+ case X86::GR32_NOREX_NOSPRegClassID:
+ case X86::GR32_ADRegClassID:
+ case X86::GR64RegClassID:
+ case X86::GR64_ABCDRegClassID:
+ case X86::GR64_TCRegClassID:
+ case X86::GR64_TCW64RegClassID:
+ case X86::GR64_NOREXRegClassID:
+ case X86::GR64_NOSPRegClassID:
+ case X86::GR64_NOREX_NOSPRegClassID:
+ case X86::LOW32_ADDR_ACCESSRegClassID:
+ case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Check if \p RC is a vector register class.
+/// I.e., FR* / VR* or one of their variant.
+static bool isFRClass(const TargetRegisterClass &RC) {
+ switch (RC.getID()) {
+ case X86::FR32RegClassID:
+ case X86::FR32XRegClassID:
+ case X86::FR64RegClassID:
+ case X86::FR64XRegClassID:
+ case X86::FR128RegClassID:
+ case X86::VR64RegClassID:
+ case X86::VR128RegClassID:
+ case X86::VR128LRegClassID:
+ case X86::VR128HRegClassID:
+ case X86::VR128XRegClassID:
+ case X86::VR256RegClassID:
+ case X86::VR256LRegClassID:
+ case X86::VR256HRegClassID:
+ case X86::VR256XRegClassID:
+ case X86::VR512RegClassID:
+ return true;
+ default:
+ return false;
+ }
+}
+
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -28670,7 +31637,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
- if (Subtarget->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
@@ -28698,7 +31665,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
- if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
@@ -28706,7 +31673,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || !Subtarget->is64Bit())
+ if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
@@ -28718,13 +31685,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
- if (!Subtarget->hasMMX()) break;
+ if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget->hasSSE2()) break;
+ if (!Subtarget.hasSSE2()) break;
// FALL THROUGH.
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
- if (!Subtarget->hasSSE1()) break;
+ if (!Subtarget.hasSSE1()) break;
switch (VT.SimpleTy) {
default: break;
@@ -28817,8 +31784,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// return "eax". This should even work for things like getting 64bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
- if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
- Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
+ // The generic code will match the first register class that contains the
+ // given register. Thus, based on the ordering of the tablegened file,
+ // the "plain" GR classes might not come first.
+ // Therefore, use a helper method.
+ if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
@@ -28834,11 +31804,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.first = 0;
Res.second = nullptr;
}
- } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
- Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
- Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
- Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
- Class == &X86::VR512RegClass) {
+ } else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
@@ -28907,7 +31873,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- if (!Subtarget->is64Bit())
+ if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
@@ -28919,12 +31885,12 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {