Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  4020
1 file changed, 2615 insertions, 1405 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 08fe2bad281e..7ff483063ec2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
@@ -70,6 +71,13 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+ "x86-experimental-pref-loop-alignment", cl::init(4),
+ cl::desc("Sets the preferable loop alignment for experiments "
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -427,7 +435,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
- // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+
+ // 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
@@ -782,6 +791,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -888,6 +898,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
@@ -922,6 +935,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
@@ -1065,6 +1086,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1126,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
@@ -1271,6 +1293,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
if (Subtarget.hasVLX()) {
+ setOperationAction(ISD::ABS, MVT::v4i64, Legal);
+ setOperationAction(ISD::ABS, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1357,16 +1381,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::ADD, MVT::v8i1, Expand);
- setOperationAction(ISD::ADD, MVT::v16i1, Expand);
- setOperationAction(ISD::SUB, MVT::v8i1, Expand);
- setOperationAction(ISD::SUB, MVT::v16i1, Expand);
- setOperationAction(ISD::MUL, MVT::v8i1, Expand);
- setOperationAction(ISD::MUL, MVT::v16i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v8i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i1, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1441,7 +1466,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Legal);
@@ -1460,12 +1485,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::ADD, MVT::v32i1, Expand);
- setOperationAction(ISD::ADD, MVT::v64i1, Expand);
- setOperationAction(ISD::SUB, MVT::v32i1, Expand);
- setOperationAction(ISD::SUB, MVT::v64i1, Expand);
- setOperationAction(ISD::MUL, MVT::v32i1, Expand);
- setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v32i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v64i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v64i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v64i1, Custom);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
@@ -1479,8 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
@@ -1546,6 +1571,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1574,9 +1600,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Expand);
- setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
@@ -1671,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
@@ -1696,6 +1723,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -1712,7 +1741,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
- setPrefLoopAlignment(4); // 2^4 bytes.
+ // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+ setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
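As a usage note (assuming the hidden cl::opt is reachable from the llc command line, as backend options normally are), passing -x86-experimental-pref-loop-alignment=5 would request 2^5 = 32-byte loop-header alignment instead of the default 2^4 = 16 bytes.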
@@ -1933,6 +1963,34 @@ bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const {
+
+ // Only relabel X86-32 for C / Stdcall CCs.
+ if (Subtarget.is64Bit())
+ return;
+ if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
+ return;
+ unsigned ParamRegs = 0;
+ if (auto *M = MF->getFunction()->getParent())
+ ParamRegs = M->getNumberRegisterParameters();
+
+  // Mark the first N integer arguments as being passed in registers.
+ for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
+ Type *T = Args[Idx].Ty;
+ if (T->isPointerTy() || T->isIntegerTy())
+ if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
+ unsigned numRegs = 1;
+ if (MF->getDataLayout().getTypeAllocSize(T) > 4)
+ numRegs = 2;
+ if (ParamRegs < numRegs)
+ return;
+ ParamRegs -= numRegs;
+ Args[Idx].IsInReg = true;
+ }
+ }
+}
+
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
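A minimal standalone sketch of the budget rule applied above (the Arg struct and helper name are illustrative simplifications, not LLVM API): integer or pointer arguments of at most 4 bytes consume one register parameter, 5-8 byte ones consume two, and marking stops as soon as the module-level register-parameter budget would be exceeded.

#include <vector>

struct Arg {
  unsigned SizeInBytes;  // DataLayout alloc size of the argument type
  bool IsIntOrPtr;       // integer or pointer type?
  bool IsInReg = false;  // result: argument passed in a register
};

// Mirrors the loop in markLibCallAttributes with LLVM types replaced by data.
static void markInRegArgs(std::vector<Arg> &Args, unsigned ParamRegs) {
  for (Arg &A : Args) {
    if (!A.IsIntOrPtr || A.SizeInBytes > 8)
      continue;                          // only small integers/pointers qualify
    unsigned NumRegs = A.SizeInBytes > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      return;                            // budget exhausted, stop marking
    ParamRegs -= NumRegs;
    A.IsInReg = true;
  }
}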
@@ -2001,21 +2059,37 @@ unsigned X86TargetLowering::getAddressSpace() const {
return 256;
}
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- // glibc has a special slot for the stack guard in tcbhead_t, use it instead
- // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
- if (!Subtarget.isTargetGlibc())
- return TargetLowering::getIRStackGuard(IRB);
-
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x14 on i386
- unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- unsigned AddressSpace = getAddressSpace();
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+ return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+ (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+ unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+ // tcbhead_t; use it instead of the usual global variable (see
+ // sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ return SegmentOffset(IRB, 0x10, getAddressSpace());
+ } else {
+ // %fs:0x28, unless we're using a Kernel code model, in which case
+ // it's %gs:0x28. gs:0x14 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
+ }
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -2027,13 +2101,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
auto *SecurityCheckCookie = cast<Function>(
M.getOrInsertFunction("__security_check_cookie",
Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext()), nullptr));
+ Type::getInt8PtrTy(M.getContext())));
SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
return;
}
- // glibc has a special slot for the stack guard.
- if (Subtarget.isTargetGlibc())
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2056,21 +2130,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
- if (!Subtarget.isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
-
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- unsigned AddressSpace, Offset;
+ if (Subtarget.isTargetAndroid()) {
+ // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+ // %gs:0x24 on i386
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x24 on i386
- Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
- AddressSpace = getAddressSpace();
- return ConstantExpr::getIntToPtr(
- ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+ // Fuchsia is similar.
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ return SegmentOffset(IRB, 0x18, getAddressSpace());
+ }
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2179,6 +2255,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Add the register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
@@ -2253,6 +2334,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
+
+ // Add the second register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
@@ -2309,6 +2394,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+ // Add the returned register to the CalleeSaveDisableRegs list.
+ if (CallConv == CallingConv::X86_RegCall)
+ MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -2444,7 +2533,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
// Convert the i32 type into v32i1 type
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concantenate the two values together
+ // Concatenate the two values together
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2488,8 +2577,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
@@ -2503,6 +2594,14 @@ SDValue X86TargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
+ // In some calling conventions we need to remove the used registers
+ // from the register mask.
+ if (RegMask && CallConv == CallingConv::X86_RegCall) {
+ for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+ }
+
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
@@ -2669,6 +2768,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
@@ -2686,13 +2786,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// taken by a return address.
int Offset = 0;
if (CallConv == CallingConv::X86_INTR) {
- const X86Subtarget& Subtarget =
- static_cast<const X86Subtarget&>(DAG.getSubtarget());
// X86 interrupts may take one or two arguments.
// On the stack there will be no return address as in regular call.
// Offset of last argument need to be set to -4/-8 bytes.
// Where offset of the first argument out of two, should be set to 0 bytes.
Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ if (Subtarget.is64Bit() && Ins.size() == 2) {
+ // The stack pointer needs to be realigned for 64 bit handlers with error
+ // code, so the argument offset changes by 8 bytes.
+ Offset += 8;
+ }
}
// FIXME: For now, all byval parameter objects are marked mutable. This can be
@@ -2707,30 +2810,71 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
- return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- } else {
- int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), isImmutable);
-
- // Set SExt or ZExt flag.
- if (VA.getLocInfo() == CCValAssign::ZExt) {
- MFI.setObjectZExt(FI, true);
- } else if (VA.getLocInfo() == CCValAssign::SExt) {
- MFI.setObjectSExt(FI, true);
+ return DAG.getFrameIndex(FI, PtrVT);
+ }
+
+ // This is an argument in memory. We might be able to perform copy elision.
+ if (Flags.isCopyElisionCandidate()) {
+ EVT ArgVT = Ins[i].ArgVT;
+ SDValue PartAddr;
+ if (Ins[i].PartOffset == 0) {
+ // If this is a one-part value or the first part of a multi-part value,
+ // create a stack object for the entire argument value type and return a
+ // load from our portion of it. This assumes that if the first part of an
+ // argument is in memory, the rest will also be in memory.
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+ /*Immutable=*/false);
+ PartAddr = DAG.getFrameIndex(FI, PtrVT);
+ return DAG.getLoad(
+ ValVT, dl, Chain, PartAddr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ // This is not the first piece of an argument in memory. See if there is
+ // already a fixed stack object including this offset. If so, assume it
+ // was created by the PartOffset == 0 branch above and create a load from
+ // the appropriate offset into it.
+ int64_t PartBegin = VA.getLocMemOffset();
+ int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+ int FI = MFI.getObjectIndexBegin();
+ for (; MFI.isFixedObjectIndex(FI); ++FI) {
+ int64_t ObjBegin = MFI.getObjectOffset(FI);
+ int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+ if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+ break;
+ }
+ if (MFI.isFixedObjectIndex(FI)) {
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+ DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+ return DAG.getLoad(
+ ValVT, dl, Chain, Addr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+ Ins[i].PartOffset));
+ }
}
+ }
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Val = DAG.getLoad(
- ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ?
- DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
+ }
+
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI.setObjectOffset(FI, Offset);
}
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
+ : Val;
}
// FIXME: Get this from tablegen.
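As a worked example of the copy-elision path above (illustrative values): a 16-byte argument split into two 8-byte parts creates one 16-byte fixed stack object when the PartOffset == 0 piece is lowered; the PartOffset == 8 piece then finds that same object, because its byte range [8, 16) lies inside the object's [0, 16), and loads from FrameIndex + 8 instead of creating a second object.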
@@ -2781,12 +2925,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
+#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
+#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -2836,8 +2982,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -2853,7 +2999,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
- // compiled to 32 bit arch, are splited up into two registers.
+ // compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
@@ -3107,8 +3253,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
- // X86 interrupts must pop the error code if present
- FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ // X86 interrupts must pop the error code (and the alignment padding) if
+ // present.
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
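Concretely (restating the comments above, not new behaviour): a 64-bit interrupt handler that receives an error code sees 8 bytes of error code plus 8 bytes of realignment padding on the stack, so it must pop 16 bytes on return, while a 32-bit handler still pops only the 4-byte error code.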
@@ -3146,6 +3293,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
}
+ if (CallConv == CallingConv::X86_RegCall) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
+ MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+ }
+
return Chain;
}
@@ -3348,8 +3501,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
@@ -3517,7 +3670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
- "Expecting custome case only in regcall calling convention");
+ "Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
++I;
@@ -3662,7 +3815,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Mask = RegInfo->getNoPreservedMask();
}
- Ops.push_back(DAG.getRegisterMask(Mask));
+ // Define a new register mask from the existing mask.
+ uint32_t *RegMask = nullptr;
+
+ // In some calling conventions we need to remove the used physical registers
+ // from the reg mask.
+ if (CallConv == CallingConv::X86_RegCall) {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // Allocate a new Reg Mask and copy Mask.
+ RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+
+ // Make sure all sub registers of the argument registers are reset
+ // in the RegMask.
+ for (auto const &RegPair : RegsToPass)
+ for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+ // Create the RegMask Operand according to our updated mask.
+ Ops.push_back(DAG.getRegisterMask(RegMask));
+ } else {
+ // Create the RegMask Operand according to the static mask.
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
if (InFlag.getNode())
Ops.push_back(InFlag);
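The register mask is an array of 32-bit words with one bit per physical register, so resetting a register (and each of its sub-registers) is plain bit arithmetic. A standalone sketch of that indexing step (helper name is illustrative):

#include <cstdint>

// Clear the bit for physical register Reg in a mask packed as 32-bit words,
// i.e. the RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)) step above.
static void clearRegFromMask(uint32_t *RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}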
@@ -3715,8 +3893,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
- Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, RegMask);
}
//===----------------------------------------------------------------------===//
@@ -4132,6 +4310,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
+ case X86ISD::ANDNP:
return true;
}
}
@@ -4448,6 +4627,11 @@ bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ return true;
+}
+
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
@@ -4460,6 +4644,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
return true;
}
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+ MVT VT = MVT::getIntegerVT(NumBits);
+ if (isTypeLegal(VT))
+ return VT;
+
+ // PMOVMSKB can handle this.
+ if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+ return MVT::v16i8;
+
+ // VPMOVMSKB can handle this.
+ if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+ return MVT::v32i8;
+
+ // TODO: Allow 64-bit type for 32-bit target.
+ // TODO: 512-bit types should be allowed, but make sure that those
+ // cases are handled in combineVectorSizedSetCCEquality().
+
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
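hasFastEqualityCompare reports 128-bit (and, with AVX2, 256-bit) compares as cheap because a vector compare plus a move-mask covers the whole width. A standalone SSE2-intrinsics sketch of the 128-bit pattern (illustrative only; this is not the code the backend emits):

#include <emmintrin.h>

// 16-byte equality via PCMPEQB + PMOVMSKB: the move-mask is 0xFFFF only when
// all 16 byte comparisons produced all-ones.
static bool equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(VA, VB)) == 0xFFFF;
}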
@@ -4555,28 +4759,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ int M0 = Mask[i];
+ int M1 = Mask[i + 1];
+
// If both elements are undef, its trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
- Mask[i + 1] % 2 == 1) {
- WidenedMask[i / 2] = Mask[i + 1] / 2;
+ if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+ WidenedMask[i / 2] = M1 / 2;
continue;
}
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+ if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+ (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
@@ -4585,9 +4791,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
- Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
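A few worked examples of the pairing rule in canWidenShuffleElements, for a 4-element mask widened to 2 elements (illustrative values):

  {0, 1, 2, 3}   -> {0, 1}     both pairs adjacent and starting on an even index
  {2, 3, 0, 1}   -> {1, 0}     widenable, the pairs are simply swapped
  {-1, 1, 2, 3}  -> {0, 1}     an undef lane adopts its aligned defined partner
  {1, 2, 3, 0}   -> fails      the pairs straddle the even/odd boundary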
@@ -4770,9 +4975,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
- assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+ assert(Bits.size() == Undefs.getBitWidth() &&
+ "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4844,10 +5050,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
- // Extract from UNDEF is UNDEF.
- if (Vec.isUndef())
- return DAG.getUNDEF(ResultVT);
-
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
@@ -4918,50 +5120,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
- // For insertion into the zero index (low half) of a 256-bit vector, it is
- // more efficient to generate a blend with immediate instead of an insert*128.
- // We are still creating an INSERT_SUBVECTOR below with an undef node to
- // extend the subvector to the size of the result vector. Make sure that
- // we are not recursing on that node by checking for undef here.
- if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- !Result.isUndef()) {
- EVT ResultVT = Result.getValueType();
- SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(ResultVT);
- SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
- Vec, ZeroIndex);
-
- // The blend instruction, and therefore its mask, depend on the data type.
- MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
- if (ScalarType.isFloatingPoint()) {
- // Choose either vblendps (float) or vblendpd (double).
- unsigned ScalarSize = ScalarType.getSizeInBits();
- assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
- unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
- SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
- }
-
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
- // AVX2 is needed for 256-bit integer blend support.
- // Integers must be cast to 32-bit because there is only vpblendd;
- // vpblendw can't be used for this because it has a handicapped mask.
-
- // If we don't have AVX2, then cast to float. Using a wrong domain blend
- // is still more efficient than using the wrong domain vinsertf128 that
- // will be created by InsertSubVector().
- MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
- SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
- Result = DAG.getBitcast(CastVT, Result);
- Vec256 = DAG.getBitcast(CastVT, Vec256);
- Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
- return DAG.getBitcast(ResultVT, Vec256);
- }
-
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
@@ -5023,7 +5181,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ ShiftBits);
}
return ExtractSubVec(WideSubVec);
}
@@ -5032,9 +5191,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
return ExtractSubVec(Vec);
}
@@ -5043,8 +5202,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
@@ -5056,12 +5215,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
@@ -5094,26 +5253,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
}
/// Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
- SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec;
- if (!Subtarget.hasInt256() && NumElts == 8) {
- Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
- Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- } else {
- Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
- }
+ SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
+static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+ SelectionDAG &DAG) {
+ EVT InVT = In.getValueType();
+ assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
+
+ if (VT.is128BitVector() && InVT.is128BitVector())
+ return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
+ : DAG.getZeroExtendVectorInReg(In, DL, VT);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
+ int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+ In = extractSubVector(In, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
+ }
+
+ return DAG.getNode(Opc, DL, VT, In);
+}
+
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary) {
@@ -5199,9 +5370,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
- SmallBitVector &UndefElts,
- SmallVectorImpl<APInt> &EltBits) {
- assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ APInt &UndefElts,
+ SmallVectorImpl<APInt> &EltBits,
+ bool AllowWholeUndefs = true,
+ bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
@@ -5211,56 +5383,83 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
// Split the undef/constant single bitset data into the target elements.
auto SplitBitData = [&]() {
- UndefElts = SmallBitVector(NumElts, false);
+ // Don't split if we don't allow undef bits.
+ bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+ if (UndefBits.getBoolValue() && !AllowUndefs)
+ return false;
+
+ UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
- APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
- UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+ unsigned BitOffset = i * EltSizeInBits;
+ APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
- // Only treat an element as UNDEF if all bits are UNDEF, otherwise
- // treat it as zero.
+ // Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
- UndefElts[i] = true;
+ if (!AllowWholeUndefs)
+ return false;
+ UndefElts.setBit(i);
continue;
}
- APInt Bits = MaskBits.lshr(i * EltSizeInBits);
- Bits = Bits.zextOrTrunc(EltSizeInBits);
+ // If only some bits are UNDEF then treat them as zero (or bail if not
+ // supported).
+ if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+ return false;
+
+ APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
EltBits[i] = Bits.getZExtValue();
}
return true;
};
- auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
- APInt &Undefs) {
+ // Collect constant bits and insert into mask/undef bit masks.
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+ unsigned BitOffset) {
if (!Cst)
return false;
unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
if (isa<UndefValue>(Cst)) {
- Mask = APInt::getNullValue(SizeInBits);
- Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+ Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
- Mask = CInt->getValue().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask.insertBits(CInt->getValue(), BitOffset);
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
- Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
return true;
}
return false;
};
+ // Extract constant bits from build vector.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (Src.isUndef()) {
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ continue;
+ }
+ auto *Cst = cast<ConstantSDNode>(Src);
+ APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ MaskBits.insertBits(Bits, BitOffset);
+ }
+ return SplitBitData();
+ }
+
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
@@ -5268,117 +5467,59 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
- APInt Bits, Undefs;
- if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+ for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
+ i * CstEltSizeInBits))
return false;
- MaskBits |= Bits.shl(i * CstEltSizeInBits);
- UndefBits |= Undefs.shl(i * CstEltSizeInBits);
- }
return SplitBitData();
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+ EltSizeInBits <= SrcEltSizeInBits) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- APInt Bits, Undefs;
- if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
- unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
- unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
- for (unsigned i = 0; i != NumBroadcastElts; ++i) {
- MaskBits |= Bits.shl(i * NumBroadcastBits);
- UndefBits |= Undefs.shl(i * NumBroadcastBits);
+ APInt Bits(SizeInBits, 0);
+ APInt Undefs(SizeInBits, 0);
+ if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ MaskBits |= Bits.shl(i * SrcEltSizeInBits);
+ UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
}
return SplitBitData();
}
}
}
+ // Extract a rematerialized scalar constant insertion.
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+ Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+ MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ MaskBits = MaskBits.zext(SizeInBits);
+ return SplitBitData();
+ }
+
return false;
}
-// TODO: Merge more of this with getTargetConstantBitsFromNode.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
- MaskNode = peekThroughBitcasts(MaskNode);
-
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
- unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
-
- // Split an APInt element into MaskEltSizeInBits sized pieces and
- // insert into the shuffle mask.
- auto SplitElementToMask = [&](APInt Element) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- for (int i = 0; i < Split; ++i) {
- APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
- Element = Element.lshr(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- };
-
- if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
- if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
- return false;
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
- const APInt &MaskElement = CN->getAPIntValue();
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- }
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+
+ // Extract the raw target constant bits.
+ // FIXME: We currently don't support UNDEF bits or mask entries.
+ if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+ EltBits, /* AllowWholeUndefs */ false,
+ /* AllowPartialUndefs */ false))
return false;
- }
- if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
- MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
- SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
- if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
- RawMask.push_back(CN->getZExtValue());
- RawMask.append(NumMaskElts - 1, 0);
- return true;
- }
-
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- SplitElementToMask(CN->getAPIntValue());
- RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
- return true;
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // We can always decode if the buildvector is all zero constants,
- // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
- if (all_of(MaskNode->ops(), X86::isZeroNode)) {
- RawMask.append(NumMaskElts, 0);
- return true;
- }
-
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
- return false;
-
- for (SDValue Op : MaskNode->ops()) {
- if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
- SplitElementToMask(CN->getAPIntValue());
- else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
- SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
- else
- return false;
- }
+ // Insert the extracted elements into the mask.
+ for (APInt Elt : EltBits)
+ RawMask.push_back(Elt.getZExtValue());
return true;
}
@@ -5405,6 +5546,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5473,8 +5615,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
- // We only decode broadcasts of same-sized vectors at the moment.
- if (N->getOperand(0).getValueType() == VT) {
+ SDValue N0 = N->getOperand(0);
+ // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+ // add the pre-extracted value to the Ops vector.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == VT &&
+ N0.getConstantOperandVal(1) == 0)
+ Ops.push_back(N0.getOperand(0));
+
+ // We only decode broadcasts of same-sized vectors, unless the broadcast
+ // came from an extract from the original width. If we found one, we
+  // pushed it to the Ops vector above.
+ if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
@@ -5669,6 +5821,19 @@ static bool setTargetShuffleZeroElements(SDValue N,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
+ assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ "Illegal split of shuffle value type");
+ unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+
+ // Extract known constant input data.
+ APInt UndefSrcElts[2];
+ SmallVector<APInt, 32> SrcEltBits[2];
+ bool IsSrcConstant[2] = {
+ getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+ SrcEltBits[0], true, false),
+ getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+ SrcEltBits[1], true, false)};
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
@@ -5677,6 +5842,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
// Determine shuffle input and normalize the mask.
+ unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
@@ -5686,39 +5852,27 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
}
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the (larger) source
- // element must be UNDEF/ZERO.
- // TODO: Is it worth testing the individual bits of a constant?
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef())
+ // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+ // TODO: We currently only set UNDEF for integer types - floats use the same
+ // registers as vectors and many of the scalar folded loads rely on the
+ // SCALAR_TO_VECTOR pattern.
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Size % V.getValueType().getVectorNumElements()) == 0) {
+ int Scale = Size / V.getValueType().getVectorNumElements();
+ int Idx = M / Scale;
+ if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
- else if (X86::isZeroNode(Op))
+ else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be all UNDEF or all ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllUndef = true;
- bool AllZero = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllUndef &= Op.isUndef();
- AllZero &= X86::isZeroNode(Op);
- }
- if (AllUndef)
+ // Attempt to extract from the source's constant bits.
+ if (IsSrcConstant[SrcIdx]) {
+ if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
- else if (AllZero)
+ else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
- continue;
}
}
@@ -5744,11 +5898,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
- case ISD::AND: {
+ case ISD::AND:
+ case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ bool IsAndN = (X86ISD::ANDNP == Opcode);
+ uint64_t ZeroMask = IsAndN ? 255 : 0;
+ if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
@@ -5758,9 +5917,55 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
- Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+ Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
- Ops.push_back(N.getOperand(0));
+ Ops.push_back(IsAndN ? N1 : N0);
+ return true;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ // Match against a scalar_to_vector of an extract from a similar vector.
+ SDValue N0 = N.getOperand(0);
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOperand(0).getValueType() != VT ||
+ !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ NumElts <= N0.getConstantOperandVal(1) ||
+ !N->isOnlyUserOf(N0.getNode()))
+ return false;
+ Ops.push_back(N0.getOperand(0));
+ Mask.push_back(N0.getConstantOperandVal(1));
+ Mask.append(NumElts - 1, SM_SentinelUndef);
+ return true;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue InVec = N.getOperand(0);
+ SDValue InScl = N.getOperand(1);
+ uint64_t InIdx = N.getConstantOperandVal(2);
+ assert(InIdx < NumElts && "Illegal insertion index");
+
+ // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
+ if (X86::isZeroNode(InScl)) {
+ Ops.push_back(InVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
+
+ // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+ // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
+ unsigned ExOp =
+ (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
+ if (InScl.getOpcode() != ISD::AssertZext ||
+ InScl.getOperand(0).getOpcode() != ExOp)
+ return false;
+
+ SDValue ExVec = InScl.getOperand(0).getOperand(0);
+ uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
+ assert(ExIdx < NumElts && "Illegal extraction index");
+ Ops.push_back(InVec);
+ Ops.push_back(ExVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::VSHLI:
@@ -5795,6 +6000,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VZEXT: {
// TODO - add support for VPMOVZX with smaller input vector types.
SDValue Src = N.getOperand(0);
@@ -5810,36 +6016,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
+/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask) {
+ int MaskWidth = Mask.size();
+ SmallVector<SDValue, 16> UsedInputs;
+ for (int i = 0, e = Inputs.size(); i < e; ++i) {
+ int lo = UsedInputs.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedInputs.push_back(Inputs[i]);
+ continue;
+ }
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ }
+ Inputs = UsedInputs;
+}
+
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
-/// Op0/Op1 inputs accordingly.
+/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+static bool resolveTargetShuffleInputs(SDValue Op,
+ SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
- SmallVector<SDValue, 2> Ops;
- if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- if (!getFauxShuffleMask(Op, Mask, Ops))
+ if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
+ if (!getFauxShuffleMask(Op, Mask, Inputs))
return false;
- int NumElts = Mask.size();
- bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
- return 0 <= Idx && Idx < NumElts;
- });
- bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
-
- Op0 = Op0InUse ? Ops[0] : SDValue();
- Op1 = Op1InUse ? Ops[1] : SDValue();
-
- // We're only using Op1 - commute the mask and inputs.
- if (!Op0InUse && Op1InUse) {
- for (int &M : Mask)
- if (NumElts <= M)
- M -= NumElts;
- Op0 = Op1;
- Op1 = SDValue();
- }
-
+ resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
@@ -5914,10 +6122,9 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
- unsigned NumNonZero, unsigned NumZero,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (NumNonZero > 8)
return SDValue();
@@ -5928,18 +6135,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index then insert into zero vector to break any register
+ // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v16i8);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v16i8, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v16i8, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -5958,24 +6173,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
+ // FIXME: Investigate extending to i32 instead of just i16.
+      // FIXME: Investigate combining the first 4 bytes as an i32 instead.
SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
if (LastIsNonZero) {
- LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
- MVT::i16, Op.getOperand(i-1));
+ LastElt =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
- ThisElt, DAG.getConstant(8, dl, MVT::i8));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
+ DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
- if (ThisElt.getNode())
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i/2, dl));
+ if (ThisElt) {
+ if (1 == i) {
+ V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+ : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ } else {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i / 2, dl));
+ }
+ }
}
}
@@ -5986,8 +6212,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
if (NumNonZero > 4)
return SDValue();
@@ -5995,18 +6220,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index then insert into zero vector to break any register
+ // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v8i16);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v8i16, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
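
A hedged walk-through of the per-element insertion strategy shared by the two build_vector lowerings above (traceBuildVectorLowering is a made-up name and the printed strings merely stand in for the DAG nodes that would be built): the first non-zero lane either seeds a freshly materialized zero vector, when zero lanes exist or it is not lane 0, or becomes a zero-extending scalar move; every later non-zero lane is a plain element insert.

// Illustrative trace only; assumes the per-element insertion path.
#include <iostream>

void traceBuildVectorLowering(unsigned NonZeros, unsigned NumZero,
                              unsigned NumElts) {
  bool First = true;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (!(NonZeros & (1u << i)))
      continue;
    if (First) {
      First = false;
      if (NumZero || i != 0) {
        std::cout << "materialize zero vector\n";
      } else {
        std::cout << "zero-extending scalar move into lane 0\n";
        continue; // lane 0 is already populated
      }
    }
    std::cout << "insert element " << i << '\n';
  }
}

int main() {
  // Non-zero lanes {0, 3, 7} with five zero lanes: the first insertion needs a
  // real zero vector, so lane 0 is inserted rather than moved.
  traceBuildVectorLowering(/*NonZeros=*/0b10001001, /*NumZero=*/5, /*NumElts=*/8);
}
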
@@ -6015,8 +6248,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
@@ -6212,7 +6444,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
- SDLoc &DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
@@ -6376,14 +6608,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
return SDValue();
}
-static Constant *getConstantVector(MVT VT, APInt SplatValue,
+static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
- APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
assert((ScalarSize == 32 || ScalarSize == 64) &&
@@ -6664,6 +6896,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
@@ -6694,11 +6927,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
- for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
- unsigned Idx = InsertIndices[i];
+
+ for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
- }
return NV;
}
@@ -7347,7 +7579,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
}
return SDValue();
@@ -7418,7 +7650,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// a constant pool load than it is to do a movd + shuffle.
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
MVT VecVT = MVT::v4i32;
@@ -7561,17 +7793,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
- if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
@@ -7767,7 +7999,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
if (V1.isUndef())
- V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
if (IsZeroV1)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
@@ -7956,7 +8188,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
-}
+ }
return true;
}
@@ -7986,6 +8218,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
return true;
}
+// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
+// mask.
+static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
+ const APInt &Zeroable) {
+ int NumElts = Mask.size();
+ assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+
+ SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
+ TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ }
+ return TargetMask;
+}
+
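
A minimal standalone sketch of what createTargetShuffleMask does, assuming the zeroable lanes arrive as a plain bitmask rather than an APInt (mergeZeroableIntoMask and the sentinel constants are invented for the example):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

static const int SentinelUndef = -1;
static const int SentinelZero  = -2;

std::vector<int> mergeZeroableIntoMask(const std::vector<int> &Mask,
                                       uint64_t Zeroable) {
  int NumElts = (int)Mask.size();
  std::vector<int> TargetMask(NumElts, SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SentinelUndef)
      continue;
    assert(0 <= M && M < 2 * NumElts && "Out of range shuffle index");
    // A lane known to be zero overrides whatever the mask says.
    TargetMask[i] = (Zeroable & (1ull << i)) ? SentinelZero : M;
  }
  return TargetMask;
}

int main() {
  // Lane 2 is known zero, lane 3 is undef.
  std::vector<int> Mask = {0, 5, 6, SentinelUndef};
  for (int M : mergeZeroableIntoMask(Mask, /*Zeroable=*/0b0100))
    std::cout << M << ' '; // 0 5 -2 -1
}
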
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return false;
+
+ SmallVector<int, 8> Unpcklwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+ /* Unary = */ false);
+ SmallVector<int, 8> Unpckhwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+ /* Unary = */ false);
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(Mask, Unpckhwd));
+ return IsUnpackwdMask;
+}
+
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -8009,7 +8276,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
return Imm;
}
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
@@ -8022,9 +8289,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- SmallBitVector Zeroable(Mask.size(), false);
+static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -8039,7 +8306,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable[i] = true;
+ Zeroable.setBit(i);
continue;
}
@@ -8057,17 +8324,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable[i] = true;
+ Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val = Val.lshr((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val = Val.lshr((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
}
continue;
}
@@ -8081,7 +8350,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
- Zeroable[i] = AllZeroable;
+ if (AllZeroable)
+ Zeroable.setBit(i);
continue;
}
}
@@ -8096,19 +8366,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
//
// The function looks for a sub-mask in which the nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
-static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
- ArrayRef<int> Mask,const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
- for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks if the mask's zeros elements are built from only zeros.
- if (Mask[i] == -1)
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non zero element
- if (NextElement == -1) {
+ if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
@@ -8124,7 +8395,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8179,19 +8450,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
-// Function convertBitVectorToUnsigned - The function gets SmallBitVector
-// as argument and convert him to unsigned.
-// The output of the function is not(zeroable)
-static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
- unsigned convertBit = 0;
- for (int i = 0, e = Zeroable.size(); i < e; i++)
- convertBit |= !(Zeroable[i]) << i;
- return convertBit;
-}
-
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8199,7 +8460,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
- unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
@@ -8215,6 +8476,91 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
ZeroVector);
}
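
The switch to `(~Zeroable).getZExtValue()` above leans on the expand semantics: the execution mask is simply the complement of the zeroable lanes. A small standalone model of that relationship (expandWithMask is a made-up helper; the real lowering emits an EXPAND node instead of computing values):

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int> expandWithMask(const std::vector<int> &Src, unsigned NumElts,
                                uint64_t Zeroable) {
  // Enabled lanes are exactly the non-zeroable ones.
  uint64_t ExpandMask = ~Zeroable & ((1ull << NumElts) - 1);
  std::vector<int> Result(NumElts, 0);
  size_t SrcIdx = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (ExpandMask & (1ull << i))
      Result[i] = Src[SrcIdx++]; // enabled lanes consume sources in order
  return Result;
}

int main() {
  // Lanes 1 and 2 are zeroable, so the expand mask enables lanes 0 and 3.
  for (int V : expandWithMask({10, 20}, 4, /*Zeroable=*/0b0110))
    std::cout << V << ' '; // 10 0 0 20
}
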
+static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+
+ bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+ for (int i = 0; i != NumElts; i += 2) {
+ int M1 = TargetMask[i + 0];
+ int M2 = TargetMask[i + 1];
+ Undef1 &= (SM_SentinelUndef == M1);
+ Undef2 &= (SM_SentinelUndef == M2);
+ Zero1 &= isUndefOrZero(M1);
+ Zero2 &= isUndefOrZero(M2);
+ }
+ assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+ "Zeroable shuffle detected");
+
+ // Attempt to match the target mask against the unpack lo/hi mask patterns.
+ SmallVector<int, 64> Unpckl, Unpckh;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
+ if (IsUnary && (Zero1 || Zero2)) {
+ // Don't bother if we can blend instead.
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+ isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+ return false;
+
+ bool MatchLo = true, MatchHi = true;
+ for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+ int M = TargetMask[i];
+
+ // Ignore if the input is known to be zero or the index is undef.
+ if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+ (M == SM_SentinelUndef))
+ continue;
+
+ MatchLo &= (M == Unpckl[i]);
+ MatchHi &= (M == Unpckh[i]);
+ }
+
+ if (MatchLo || MatchHi) {
+ UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ return true;
+ }
+ }
+
+ // If a binary shuffle, commute and try again.
+ if (!IsUnary) {
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ std::swap(V1, V2);
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ std::swap(V1, V2);
+ return true;
+ }
+ }
+
+ return false;
+}
+
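
A simplified, single-128-bit-lane sketch of the reference-mask matching that matchVectorShuffleWithUNPCK performs, including the commuted binary case (makeUnpackMask and commuteMask are stand-ins; the real helpers handle multiple lanes and sentinel-aware equivalence):

#include <iostream>
#include <vector>

std::vector<int> makeUnpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // element from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // element from V2 (or V1)
  }
  return Mask;
}

void commuteMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = (M < NumElts) ? M + NumElts : M - NumElts; // swap V1/V2 references
}

int main() {
  // unpcklwd with swapped inputs: matches only after commuting the reference.
  std::vector<int> Mask = {8, 0, 9, 1, 10, 2, 11, 3};
  std::vector<int> Unpckl = makeUnpackMask(8, /*Lo=*/true, /*Unary=*/false);
  bool Direct = (Mask == Unpckl);
  commuteMask(Unpckl);
  bool Commuted = (Mask == Unpckl);
  std::cout << Direct << ' ' << Commuted << '\n'; // 0 1
}
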
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -8248,13 +8594,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -8286,10 +8631,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
- int NumEltBits = EltVT.getSizeInBits();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
@@ -8307,51 +8650,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
-/// \brief Try to emit a blend instruction for a shuffle.
-///
-/// This doesn't do any checks for the availability of instructions for blending
-/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
-/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const SmallBitVector &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
- SmallVector<int, 8> Mask(Original.begin(), Original.end());
- bool ForceV1Zero = false, ForceV2Zero = false;
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> TargetMask,
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ uint64_t &BlendMask) {
+ bool V1IsZeroOrUndef =
+ V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZeroOrUndef =
+ V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+ BlendMask = 0;
+ ForceV1Zero = false, ForceV2Zero = false;
+ assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- unsigned BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- if (M < 0)
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ int M = TargetMask[i];
+ if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
continue;
}
- if (Zeroable[i]) {
- if (V1IsZero) {
+ if (M == SM_SentinelZero) {
+ if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- Mask[i] = i;
+ TargetMask[i] = i;
continue;
}
- if (V2IsZero) {
+ if (V2IsZeroOrUndef) {
ForceV2Zero = true;
- BlendMask |= 1u << i;
- Mask[i] = i + Size;
+ BlendMask |= 1ull << i;
+ TargetMask[i] = i + Size;
continue;
}
}
- return SDValue(); // Shuffled input!
+ return false;
}
+ return true;
+}
+
+uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
+ uint64_t ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1ull << i))
+ ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
+ return ScaledMask;
+}
+
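
A standalone sketch of the blend-immediate arithmetic above, assuming no zeroable lanes for brevity (matchAsBlend and scaleBlendMask are invented names; the real code additionally rewrites SM_SentinelZero lanes by forcing one operand to a zero vector):

#include <cstdint>
#include <iostream>
#include <vector>

bool matchAsBlend(const std::vector<int> &Mask, uint64_t &BlendMask) {
  BlendMask = 0;
  int Size = (int)Mask.size();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;               // undef, or element i of V1
    if (M == i + Size) {
      BlendMask |= 1ull << i; // element i of V2
      continue;
    }
    return false;             // a real shuffle, not a blend
  }
  return true;
}

// Widen each bit when the blend must be issued at a finer granularity,
// e.g. expressing a v4i64 blend as a v8i32 one (Scale == 2).
uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  uint64_t BlendMask;
  std::vector<int> Mask = {0, 5, 2, 7}; // 4 lanes; lanes 1 and 3 from V2
  if (matchAsBlend(Mask, BlendMask))
    std::cout << std::hex << BlendMask << ' '             // a
              << scaleBlendMask(BlendMask, 4, 2) << '\n'; // cc
}
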
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ BlendMask))
+ return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
@@ -8359,15 +8732,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
- auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
- unsigned ScaledMask = 0;
- for (int i = 0; i != Size; ++i)
- if (BlendMask & (1u << i))
- for (int j = 0; j != Scale; ++j)
- ScaledMask |= 1u << (i * Scale + j);
- return ScaledMask;
- };
-
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
@@ -8387,7 +8751,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
@@ -8400,7 +8764,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
@@ -8417,7 +8781,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
@@ -8428,6 +8792,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
+ if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
@@ -8465,7 +8836,17 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
}
-
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8: {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
default:
llvm_unreachable("Not a supported integer vector type!");
}
@@ -8503,7 +8884,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -8757,7 +9138,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask, int MaskOffset,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -8819,7 +9200,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8855,12 +9236,12 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+ assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
@@ -8987,7 +9368,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
-/// begin and can start from an offseted element index in the input; to
+/// begin and can start from an offsetted element index in the input; to
/// avoid excess shuffling the offset must either being in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
@@ -9027,21 +9408,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
- // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
-
- // For 256-bit vectors, we only need the lower (128-bit) input half.
- // For 512-bit vectors, we only need the lower input half or quarter.
- if (VT.getSizeInBits() > 128)
- InputV = extractSubVector(InputV, 0, DAG, DL,
- std::max(128, (int)VT.getSizeInBits() / Scale));
-
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+ InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -9158,7 +9532,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
@@ -9314,7 +9688,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -9612,7 +9986,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (((BroadcastIdx * EltSize) % 128) != 0)
return SDValue();
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ // The shuffle input might have been a bitcast we looked through; look at
+ // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
+ // later bitcast it to BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+ "Unexpected vector size");
+
+ MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
DAG.getIntPtrConstant(BroadcastIdx, DL));
}
@@ -9642,6 +10025,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ V = extract128BitVector(V, 0, DAG, DL);
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
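
A loose standalone model of the 128-bit restriction above (broadcastFrom128 is an invented helper and only handles i32 lanes): the broadcast source is narrowed to the 128-bit chunk holding the element, which must sit on a 128-bit boundary here, and the result repeats that chunk's first lane.

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint32_t> broadcastFrom128(const std::vector<uint32_t> &Wide,
                                       unsigned BroadcastIdx,
                                       unsigned NumDstElts) {
  const unsigned EltsPer128 = 128 / 32;  // i32 elements per 128-bit chunk
  if ((BroadcastIdx % EltsPer128) != 0)
    return {};                           // misaligned case not modeled
  // Extract the 128-bit subvector containing the broadcast element ...
  std::vector<uint32_t> Chunk(Wide.begin() + BroadcastIdx,
                              Wide.begin() + BroadcastIdx + EltsPer128);
  // ... and splat its first lane into the destination width.
  return std::vector<uint32_t>(NumDstElts, Chunk[0]);
}

int main() {
  std::vector<uint32_t> V = {1, 2, 3, 4, 5, 6, 7, 8}; // a v8i32
  for (uint32_t X : broadcastFrom128(V, /*BroadcastIdx=*/4, /*NumDstElts=*/8))
    std::cout << X << ' '; // 5 5 5 5 5 5 5 5
}
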
@@ -9653,7 +10042,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
@@ -9742,7 +10131,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
@@ -9877,7 +10266,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9959,7 +10348,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10178,7 +10567,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10261,7 +10650,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10353,7 +10742,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
- // up the inputs, bypassing domain shift penalties that we would encur if we
+ // up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
@@ -10384,18 +10773,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
- assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
SmallVector<int, 4> LoInputs;
- std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0; });
+ copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 0; });
+ copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
@@ -10574,7 +10961,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
- else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
@@ -10830,7 +11217,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+ const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -10891,7 +11278,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11075,7 +11462,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11132,14 +11519,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0 && M < 8; });
+ copy_if(Mask, std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 8; });
+ copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -11193,7 +11579,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
- "Conflicting entrties in the original shuffle!");
+ "Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
@@ -11365,7 +11751,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
@@ -11621,7 +12007,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 4> WidenedMask;
@@ -12091,7 +12477,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm,
ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
- assert(VT.getScalarType() == MVT::f64 &&
+ assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
@@ -12127,6 +12513,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
+  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
+ "Unexpected data type for VSHUFPD");
+
unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
@@ -12153,7 +12542,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12250,7 +12639,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling..
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12338,7 +12727,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12414,6 +12803,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+  // For non-AVX512, if the mask is of 16-bit elements in a lane, then try to
+  // split, since after the split we get more efficient code using vpunpcklwd
+  // and vpunpckhwd instructions than vblend.
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG))
+ return V;
+
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
@@ -12429,7 +12826,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling..
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12445,6 +12842,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
+  // For non-AVX512, if the mask is of 16-bit elements in a lane, then try to
+  // split, since after the split we get more efficient code than vblend by
+  // using vpunpcklwd and vpunpckhwd instructions.
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ !Subtarget.hasAVX512())
+ if (SDValue V =
+ lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return V;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
@@ -12533,7 +12939,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling..
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12619,7 +13025,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling..
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12692,7 +13098,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
@@ -12844,7 +13250,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12891,12 +13297,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12925,6 +13335,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
@@ -12938,7 +13352,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12994,12 +13408,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13062,12 +13480,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13109,12 +13530,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13159,6 +13584,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -13170,7 +13599,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
@@ -13251,7 +13680,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ISD::isBuildVectorAllZeros(V1.getNode()))
V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V1 = getOnesVector(ExtVT, DAG, DL);
else
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
@@ -13260,7 +13689,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
else if (ISD::isBuildVectorAllZeros(V2.getNode()))
V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V2 = getOnesVector(ExtVT, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
@@ -13392,8 +13821,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- if (Zeroable.all())
+ APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
@@ -13569,10 +13998,14 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
- // extend vector to VR512
+ // extend vector to VR512/128
if (!isa<ConstantSDNode>(Idx)) {
- MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ unsigned NumElts = VecVT.getVectorNumElements();
+    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
+    // than extending to 128/256-bit.
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
@@ -13590,9 +14023,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
if (MaxSift - IdxVal)
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
DAG.getIntPtrConstant(0, dl));
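
The KSHIFTL/KSHIFTR pair above isolates a single bit of the mask register: the wanted bit is shifted up to the top position and then logically shifted down to bit 0, which also clears every other bit. A plain-integer model of the same arithmetic (extractMaskBit is a made-up name):

#include <cstdint>
#include <iostream>

uint16_t extractMaskBit(uint16_t Mask, unsigned NumElts, unsigned Idx) {
  unsigned MaxShift = NumElts - 1;
  Mask = (uint16_t)(Mask << (MaxShift - Idx)); // KSHIFTL: bit Idx -> MSB
  Mask = (uint16_t)(Mask >> MaxShift);         // KSHIFTR: MSB -> bit 0
  return Mask;                                 // 0 or 1
}

int main() {
  uint16_t K = 0b0000000000100100; // bits 2 and 5 set
  std::cout << extractMaskBit(K, 16, 5)
            << extractMaskBit(K, 16, 3) << '\n'; // 10
}
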
@@ -13610,24 +14043,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
- if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getScalarSizeInBits() == 32)) {
-
- MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
- MaskEltVT.getSizeInBits());
+  // It's more profitable to go through memory (1 cycle throughput)
+  // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+  // The IACA tool was used to get the performance estimates
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+ //
+ // example : extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
- Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
- DAG.getConstant(0, dl, PtrVT));
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
- DAG.getConstant(0, dl, PtrVT));
- }
return SDValue();
}
@@ -13675,7 +14120,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: handle v16i8.
+  // TODO: We only extract a single element from v16i8; we can probably afford
+  // to be more aggressive here before falling back to the default approach of
+  // spilling to the stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
@@ -13734,7 +14205,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if(Vec.isUndef()) {
if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return EltInVec;
}
@@ -13744,21 +14215,21 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if (IdxVal == 0 ) {
// EltInVec already at correct index and other bits are 0.
// Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
@@ -13790,17 +14261,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
- // If we are clearing out a element, we do this more efficiently with a
- // blend shuffle than a costly integer insertion.
- // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+ bool IsZeroElt = X86::isZeroNode(N1);
+ bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+ // If we are inserting an element, see if we can do this more efficiently with
+ // a blend shuffle with a rematerializable vector than a costly integer
+ // insertion.
// TODO: pre-SSE41 targets will tend to use bit masking - this could still
// be beneficial if we are inserting several zeros and can combine the masks.
- if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
- SmallVector<int, 8> ClearMask;
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
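+ // e.g. inserting zero into element 2 of a v4i32 builds the blend mask
+ // <0,1,6,3>, which takes lane 2 from the constant vector and the remaining
+ // lanes from N0.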
+ SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
- ClearMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getConstant(-1, dl, VT);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -13837,25 +14312,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget.hasSSE41()) {
- if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
- unsigned Opc;
- if (VT == MVT::v8i16) {
- Opc = X86ISD::PINSRW;
- } else {
- assert(VT == MVT::v16i8);
- Opc = X86ISD::PINSRB;
- }
-
- // Transform it so it match pinsr{b,w} which expects a GR32 as its second
- // argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument. SSE41 is required for pinsrb.
+ if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+ assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+ Opc = X86ISD::PINSRB;
}
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
@@ -13885,36 +14362,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
- if (EltVT == MVT::i32 || EltVT == MVT::i64) {
- // PINSR* works with constant index.
+ // PINSR* works with constant index.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
- }
}
- if (EltVT == MVT::i8)
- return SDValue();
-
- if (EltVT.getSizeInBits() == 16) {
- // Transform it so it match pinsrw which expects a 16-bit value in a GR32
- // as its second argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
- }
return SDValue();
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
+ // It's always cheaper to replace an xor+movd with xorps, and it simplifies
+ // further combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
- unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
@@ -13923,9 +14393,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
@@ -13947,20 +14421,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
In.getSimpleValueType().is512BitVector()) &&
"Can only extract from 256-bit or 512-bit vectors");
- if (ResVT.is128BitVector())
- return extract128BitVector(In, IdxVal, DAG, dl);
- if (ResVT.is256BitVector())
- return extract256BitVector(In, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
-}
+ // If the input is a build_vector, just emit a smaller one.
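+ // e.g. extracting the upper v4i32 half of a v8i32 build_vector just emits a
+ // new build_vector from operands 4-7.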
+ unsigned ElemsPerChunk = ResVT.getVectorNumElements();
+ if (In.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT,
+ makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
-static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
- if (llvm::all_of(ValidUsers,
- [&I](SDValue V) { return V.getNode() != *I; }))
- return false;
- return true;
+ // Everything else is legal.
+ return Op;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
@@ -13968,83 +14436,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
-
- SDLoc dl(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue SubVec = Op.getOperand(1);
- SDValue Idx = Op.getOperand(2);
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT OpVT = Op.getSimpleValueType();
- MVT SubVecVT = SubVec.getSimpleValueType();
-
- if (OpVT.getVectorElementType() == MVT::i1)
- return insert1BitVector(Op, DAG, Subtarget);
-
- assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
- "Can only insert into 256-bit or 512-bit vectors");
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
- if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and the only users of the load, then
- // lower to a VBROADCASTF128/VBROADCASTI128/etc.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
- areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
- }
- }
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
- }
- }
- }
-
- if (SubVecVT.is128BitVector())
- return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- if (SubVecVT.is256BitVector())
- return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
+ return insert1BitVector(Op, DAG, Subtarget);
}
// Returns the appropriate wrapper opcode for a global reference.
@@ -14062,7 +14456,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
@@ -14438,7 +14832,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget.isTargetWindowsItanium() ||
Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
- // Need to generate someting similar to:
+ // Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
@@ -15489,32 +15883,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector comparison result.
- // TODO: We should be able to support other operations as long as we
- // we are saturating+packing zero/all bits only.
- auto IsPackableComparison = [](SDValue V) {
- unsigned Opcode = V.getOpcode();
- return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
- Opcode == X86ISD::CMPP);
- };
-
- if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
- all_of(In->ops(), IsPackableComparison))) {
+ // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
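+ // If every lane is known to be all sign bits (i.e. 0 or -1), PACKSS's
+ // signed saturation cannot change the value, so the pack is lossless.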
+ if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
- }
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
- In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
- ShufMask);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
@@ -15530,30 +15913,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
- SmallVector<SDValue,32> pshufbMask;
- for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
- for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
- }
- SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
- In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ // The PSHUFB mask:
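+ // It packs the low i16 of each i32 into the bottom 8 bytes of each 128-bit
+ // lane; the v4i64 shuffle below then moves the low quadword of each lane
+ // into the low 128 bits.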
+ static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29,
+ -1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- ShufMask);
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -15572,9 +15945,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -15598,17 +15970,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
- SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), MaskVec);
+ In = DAG.getBitcast(NVT, In);
+ SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
-SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
-
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
@@ -15616,8 +15985,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
- dl, VT,
+ return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
@@ -15891,7 +16259,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
- // If more than one full vectors are evaluated, OR them first before PTEST.
+ // If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
@@ -15900,8 +16268,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
@@ -16366,7 +16733,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
}
/// If we have at least two divisions that use the same divisor, convert to
-/// multplication by a reciprocal. This may need to be adjusted for a given
+/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
@@ -17241,12 +17608,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ // (select (and (x, 0x1) == 0), y, (z ^ y)) -> (-(and (x, 0x1)) & z) ^ y
+ // (select (and (x, 0x1) == 0), y, (z | y)) -> (-(and (x, 0x1)) & z) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
-
- unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode =
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
@@ -17283,6 +17652,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
+ } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ Cmp.getOperand(0).getOpcode() == ISD::AND &&
+ isOneConstant(Cmp.getOperand(0).getOperand(1))) {
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ SDValue Src1, Src2;
+ // True if Op2 is an XOR or OR operator and one of its operands
+ // equals Op1, i.e. the select looks like (a, a op b) or (b, a op b).
+ auto isOrXorPattern = [&]() {
+ if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
+ (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
+ Src1 =
+ Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
+ Src2 = Op1;
+ return true;
+ }
+ return false;
+ };
+
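+ // When the low bit of x is 0 the negated mask below is all zeros and the
+ // result is just y; when it is 1 the mask is all ones, the AND keeps z and
+ // the final op yields z ^ y (or z | y), matching the select without a CMOV.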
+ if (isOrXorPattern()) {
+ SDValue Neg;
+ unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
+ // We need a mask of all zeros or all ones with the same size as the
+ // other operands.
+ if (CmpSz > VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
+ else if (CmpSz < VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
+ DAG.getConstant(1, DL, VT));
+ else
+ Neg = CmpOp0;
+ SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Neg); // -(and (x, 0x1))
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+ return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
+ }
}
}
@@ -17423,17 +17829,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
-
- ((Subtarget.hasBWI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() <= 16)) ||
+ (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+ ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
- ((Subtarget.hasDQI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned NumElts = VT.getVectorNumElements();
@@ -17441,8 +17840,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (VT.is512BitVector() && InVTElt != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
- return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
+ return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
}
if (InVTElt != MVT::i1)
@@ -17454,10 +17853,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue V;
if (Subtarget.hasDQI()) {
- V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
assert(!VT.is512BitVector() && "Unexpected vector type");
} else {
- SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
@@ -17506,11 +17905,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
- // SSE41 targets can use the pmovsx* instructions directly.
- unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
- X86ISD::VSEXT : X86ISD::VZEXT;
- if (Subtarget.hasSSE41())
+ // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
+ // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
+ // still need to be handled here for 256/512-bit results.
+ if (Subtarget.hasInt256()) {
+ assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);
+ }
// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
@@ -17595,8 +17998,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+ OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+ OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
@@ -17674,7 +18077,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ if ((Subtarget.hasBWI() && NumElts >= 32) ||
+ (Subtarget.hasDQI() && NumElts < 16) ||
NumElts == 16) {
// Load and extend - everything is legal
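+ // (KMOVW needs only AVX512F, KMOVB needs DQI, and KMOVD/KMOVQ need BWI.)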
if (NumElts < 8) {
@@ -17703,7 +18107,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
if (NumElts <= 8) {
// A subset, assume that we have only AVX-512F
- unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ unsigned NumBitsToLoad = 8;
MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
Ld->getBasePtr(),
@@ -17911,7 +18315,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
if (Subtarget.hasSSE41()) {
- SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
}
@@ -18469,6 +18873,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+ // Bitcast the source vector to the output type; this is mainly necessary for
+ // vXi8/vXi64 shifts.
+ if (VT != SrcOp.getSimpleValueType())
+ SrcOp = DAG.getBitcast(VT, SrcOp);
+
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
@@ -18485,9 +18894,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
- // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
- if (VT == SrcOp.getSimpleValueType() &&
- ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ // vector of Constants or UNDEFs.
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;
@@ -18578,11 +18986,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
ShAmt = ShAmt.getOperand(0);
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -18853,6 +19261,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Rnd),
+ Mask, passThru, Subtarget, DAG);
+ }
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
@@ -19306,6 +19722,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case MASK_BINOP: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
+ return DAG.getBitcast(VT, Res);
+ }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -19478,6 +19903,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_knot_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
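+ // getConstant with a vector type splats the value, so this is an all-ones
+ // v16i1 that the XOR below uses to implement the NOT.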
+ SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kandn_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ // Invert LHS for the not.
+ LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kxnor_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ // Invert result for the not.
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -19603,6 +20055,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
}
}
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ EVT MaskVT = Mask.getValueType();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
@@ -19617,7 +20091,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.isUndef())
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -19656,7 +20133,6 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- //SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
@@ -19928,6 +20404,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
case GATHER: {
//gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
@@ -19953,8 +20439,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
- assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
- unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+ assert((HintVal == 2 || HintVal == 3) &&
+ "Wrong prefetch hint in intrinsic: should be 2 or 3");
+ unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
@@ -20368,7 +20855,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
- const AttributeSet &Attrs = Func->getAttributes();
+ const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
@@ -20802,9 +21289,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
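+ // For i1 elements both add and sub reduce to addition modulo 2, i.e. XOR.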
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
@@ -20812,14 +21300,23 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1));
+static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
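+ // Split the 256-bit source into two 128-bit halves, take ISD::ABS of each
+ // half and concatenate the results.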
+ SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
+ SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::ABS, dl, NewVT, Lo),
+ DAG.getNode(ISD::ABS, dl, NewVT, Hi));
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -20834,7 +21331,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- if (VT == MVT::i1)
+ if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -20874,8 +21371,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
- BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
+ BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -20894,8 +21391,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
- BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
+ BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21056,8 +21553,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
- SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
- SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
+ SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
DAG.getConstant(8, dl, MVT::v16i16));
@@ -21073,8 +21570,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
- BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
+ BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -21093,8 +21590,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
- BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
+ BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21148,8 +21645,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
@@ -21157,11 +21654,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC),
- static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args))
- .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(
+ getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
@@ -21269,15 +21770,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() < 16)
return false;
- if (VT.is512BitVector() &&
+ if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
- bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256());
+ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget.hasVLX() ||
- (VT != MVT::v2i64 && VT != MVT::v4i64));
+ bool AShift = LShift && (Subtarget.hasAVX512() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -21301,7 +21802,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget.hasVLX())
+ if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -22062,10 +22563,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::INC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
@@ -22077,10 +22578,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::DEC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
@@ -22470,7 +22971,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
// index into a in-register pre-computed pop count table. We then split up the
// input vector in two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
- // masked out higher ones) for each byte. PSHUB is used separately with both
+ // masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is a
// i8 vector where each element contains the pop count for input byte.
//
@@ -22867,8 +23368,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
@@ -22885,8 +23386,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
: (Type*)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -23086,7 +23588,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23142,7 +23644,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23202,7 +23704,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
- // The pass-thru value
+ // The pass-through value
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src0 = ExtendToType(Src0, NewVT, DAG);
@@ -23284,7 +23786,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -23303,7 +23805,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
@@ -23360,12 +23862,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
- case ISD::ADD: return LowerADD(Op, DAG);
- case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::ADD:
+ case ISD::SUB: return LowerADD_SUB(Op, DAG);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
@@ -23768,7 +24271,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
@@ -23779,16 +24281,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::ABS: return "X86ISD::ABS";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
@@ -23827,7 +24332,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
@@ -23876,6 +24380,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
+ case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
@@ -23976,9 +24482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
@@ -24302,7 +24812,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24338,7 +24848,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24398,7 +24908,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
@@ -24413,6 +24923,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
return BB;
}
+static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ // Address into RAX/EAX
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI->getOperand(i));
+
+ // The instruction doesn't actually take any operands though.
+ BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -24536,12 +25066,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -24561,12 +25091,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 16)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 16)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -24588,13 +25118,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .addReg(NextOffsetReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -24608,12 +25138,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -24644,13 +25174,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .addReg(NextAddrReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -24867,7 +25397,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
- // to two successives branches. For that, we look for another CMOV as the
+ // to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
// Without this, we would add a PHI between the two jumps, which ends up
@@ -25123,12 +25653,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
// instruction using the same address operands.
if (Operand.isReg())
Operand.setIsKill(false);
- MIB.addOperand(Operand);
+ MIB.add(Operand);
}
MachineInstr *FOpMI = MIB;
MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -25508,7 +26038,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+ MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -25591,7 +26121,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
@@ -25599,7 +26129,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
@@ -25608,7 +26138,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
@@ -25625,7 +26155,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
@@ -25644,8 +26174,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
- /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
-
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
@@ -25655,7 +26183,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
- .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
@@ -25677,7 +26205,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
@@ -25749,9 +26277,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
- const X86RegisterInfo &RI = XII->getRegisterInfo();
-
+ const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
@@ -25799,8 +26325,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
- const MCPhysReg *SavedRegs =
- Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
@@ -26033,6 +26558,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
case X86::MONITORX:
return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+
+ // Cache line zero
+ case X86::CLZERO:
+ return emitClzero(&MI, BB, Subtarget);
+
// PKU feature
case X86::WRPKRU:
return emitWRPKRU(MI, BB, Subtarget);
@@ -26137,10 +26667,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = KnownZero.getBitWidth();
unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
@@ -26167,44 +26699,91 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ KnownZero.setBits(1, BitWidth);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ KnownZero.setBits(NumLoBits, BitWidth);
+ break;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
+ KnownZero = APInt::getAllOnesValue(BitWidth);
+ break;
+ }
+
+ DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
+ unsigned ShAmt = ShiftImm->getZExtValue();
+ if (Opc == X86ISD::VSHLI) {
+ KnownZero = KnownZero << ShAmt;
+ KnownOne = KnownOne << ShAmt;
+ // Low bits are known zero.
+ KnownZero.setLowBits(ShAmt);
+ } else {
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+ // High bits are known zero.
+ KnownZero.setHighBits(ShAmt);
+ }
+ }
break;
}
case X86ISD::VZEXT: {
SDValue N0 = Op.getOperand(0);
- unsigned NumElts = Op.getValueType().getVectorNumElements();
- unsigned InNumElts = N0.getValueType().getVectorNumElements();
- unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT SrcVT = N0.getValueType();
+ unsigned InNumElts = SrcVT.getVectorNumElements();
+ unsigned InBitWidth = SrcVT.getScalarSizeInBits();
+ assert(InNumElts >= NumElts && "Illegal VZEXT input");
KnownZero = KnownOne = APInt(InBitWidth, 0);
- APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
- DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+ APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
+ DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1);
KnownOne = KnownOne.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+ KnownZero.setBits(InBitWidth, BitWidth);
break;
}
}
}
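
For illustration only (not part of the patch): a minimal standalone C++ sketch, reduced to a single 16-bit lane, of the known-bits bookkeeping the new VSHLI/VSRLI case performs — a left shift by an immediate makes the low ShAmt bits known zero, a logical right shift makes the high ShAmt bits known zero, and a shift amount at or beyond the lane width makes every bit known zero.

    #include <cstdint>
    #include <cstdio>

    // Track which bits of a 16-bit lane are known zero / known one and push
    // that knowledge through a shift by an immediate (illustrative sketch,
    // not LLVM's APInt-based implementation).
    struct KnownBits16 {
      uint16_t Zero = 0; // bit set => that bit is known to be 0
      uint16_t One = 0;  // bit set => that bit is known to be 1
    };

    static KnownBits16 shiftLeft(KnownBits16 K, unsigned ShAmt) {
      KnownBits16 R;
      R.Zero = uint16_t(K.Zero << ShAmt);
      R.One = uint16_t(K.One << ShAmt);
      R.Zero |= uint16_t((1u << ShAmt) - 1); // low bits become known zero
      return R;
    }

    static KnownBits16 logicalShiftRight(KnownBits16 K, unsigned ShAmt) {
      KnownBits16 R;
      R.Zero = uint16_t(K.Zero >> ShAmt);
      R.One = uint16_t(K.One >> ShAmt);
      R.Zero |= uint16_t(~(0xFFFFu >> ShAmt)); // high bits become known zero
      return R;
    }

    int main() {
      KnownBits16 K;
      K.Zero = 0xFF00; // pretend the top byte of the input is known zero
      KnownBits16 L = shiftLeft(K, 4);
      KnownBits16 R = logicalShiftRight(K, 4);
      std::printf("shl4: zero=%04x  lshr4: zero=%04x\n",
                  unsigned(L.Zero), unsigned(R.Zero));
      return 0;
    }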
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
- // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
- if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getScalarValueSizeInBits();
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned VTBits = Op.getScalarValueSizeInBits();
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::SETCC_CARRY:
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ return VTBits;
- if (Op.getOpcode() == X86ISD::VSEXT) {
- EVT VT = Op.getValueType();
- EVT SrcVT = Op.getOperand(0).getValueType();
- unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ case X86ISD::VSEXT: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ ShiftVal += Tmp;
+ return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::PCMPGT:
+ case X86ISD::PCMPEQ:
+ case X86ISD::CMPP:
+ case X86ISD::VPCOM:
+ case X86ISD::VPCOMU:
+ // Vector compares return zero/all-bits result values.
+ return VTBits;
+ }
+
// Fallback case.
return 1;
}
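
The new ComputeNumSignBitsForTargetNode cases rest on two facts: a sign extension adds (destination width minus source width) sign bits, and an arithmetic right shift by ShAmt adds ShAmt sign bits, clamped at the lane width. A standalone sketch of that counting for one 32-bit lane (illustration only, not the LLVM helper):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Count how many leading bits of a 32-bit lane equal the sign bit.
    static unsigned numSignBits(uint32_t U) {
      unsigned Sign = U >> 31;
      unsigned N = 1;
      for (int Bit = 30; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
        ++N;
      return N;
    }

    int main() {
      // -5 = 0xFFFFFFFB has 29 leading bits equal to its sign bit.
      unsigned Known = numSignBits(uint32_t(-5));
      // An arithmetic shift right by ShAmt adds ShAmt sign bits, clamped to
      // the lane width, which is what the X86ISD::VSRAI case computes.
      unsigned ShAmt = 7;
      unsigned AfterSra = std::min(32u, Known + ShAmt);
      std::printf("sign bits: %u, after sra by %u: %u\n", Known, ShAmt, AfterSra);
      return 0;
    }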
@@ -26228,24 +26807,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
- if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
- isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
-
- // Match against a VZEXT instruction.
- // TODO: Add 256/512-bit vector support.
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
@@ -26255,19 +26827,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
- SrcVT = MaskVT;
+ unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+ SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+ if (SrcVT != MaskVT)
+ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
- Shuffle = X86ISD::VZEXT;
+ Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
+ : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
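
The loop above recognizes a shuffle mask as a zero extension when every group of Scale destination lanes reads the next source element followed by zero/undef lanes. A simplified standalone sketch of that mask test (exact indices required, -1 standing in for the zero/undef sentinels):

    #include <cstdio>
    #include <vector>

    // Return true if Mask describes a zero extension by Scale: lane i*Scale
    // reads source element i and the remaining Scale-1 lanes are zero/undef.
    static bool isZeroExtendMask(const std::vector<int> &Mask, unsigned Scale) {
      if (Scale < 2 || Mask.size() % Scale != 0)
        return false;
      unsigned NumDstElts = Mask.size() / Scale;
      for (unsigned i = 0; i != NumDstElts; ++i) {
        if (Mask[i * Scale] != int(i))
          return false;
        for (unsigned j = 1; j != Scale; ++j)
          if (Mask[i * Scale + j] != -1)
            return false;
      }
      return true;
    }

    int main() {
      std::vector<int> PMovZxWd = {0, -1, 1, -1, 2, -1, 3, -1}; // v8i16 -> v4i32
      std::printf("scale 2: %s\n", isZeroExtendMask(PMovZxWd, 2) ? "match" : "no");
      return 0;
    }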
+ // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
- if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
@@ -26285,7 +26870,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is256BitVector() && FloatDomain) {
+ if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
@@ -26304,7 +26889,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is512BitVector() && FloatDomain) {
+ if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26343,24 +26928,26 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
bool ContainsZeros = false;
- SmallBitVector Zeroable(NumMaskElts, false);
+ APInt Zeroable(NumMaskElts, false);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
- Zeroable[i] = isUndefOrZero(M);
+ if (isUndefOrZero(M))
+ Zeroable.setBit(i);
ContainsZeros |= (M == SM_SentinelZero);
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskVT.getScalarSizeInBits(), Mask,
0, Zeroable, Subtarget);
@@ -26423,19 +27010,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- if (FloatDomain && !Subtarget.hasAVX())
+ if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
return false;
// Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
- FloatDomain = true;
+ if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
+ AllowFloatDomain = true;
+ AllowIntDomain = false;
+ }
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
@@ -26443,7 +27032,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
@@ -26452,7 +27041,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// VPERMILPD can permute with a non-repeating shuffle.
- if (FloatDomain && MaskScalarSizeInBits == 64) {
+ if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
@@ -26476,8 +27065,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskScalarSizeInBits == 64)
scaleShuffleMask(2, RepeatedMask, WordMask);
- Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
- ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+ Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+ ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
@@ -26487,34 +27076,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain, SDValue &V1, SDValue &V2,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
ShuffleVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
ShuffleVT = MaskVT;
return true;
@@ -26527,57 +27118,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- MVT LegalVT = MaskVT;
- if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
- LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
-
- SmallVector<int, 64> Unpckl, Unpckh;
- if (IsUnary) {
- createUnpackShuffleMask(MaskVT, Unpckl, true, true);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, true);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
- } else {
- createUnpackShuffleMask(MaskVT, Unpckl, true, false);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, false);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
+ if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
+ DAG, Subtarget)) {
+ ShuffleVT = MaskVT;
+ if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
+ ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ return true;
}
}
@@ -26585,17 +27131,19 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
- SDValue &V1, SDValue &V2,
- SDLoc &DL, SelectionDAG &DAG,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -26606,77 +27154,74 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to X86ISD::BLENDI.
- if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
- (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
- // Determine a type compatible with X86ISD::BLENDI.
- // TODO - add 16i16 support (requires lane duplication).
- MVT BlendVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v8i32;
- else if (BlendVT == MVT::v2i64)
- BlendVT = MVT::v4i32;
- } else {
- if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
- BlendVT = MVT::v8i16;
- else if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v4f64;
- else if (BlendVT == MVT::v8i32)
- BlendVT = MVT::v8f32;
- }
-
- unsigned BlendSize = BlendVT.getVectorNumElements();
- unsigned MaskRatio = BlendSize / NumMaskElts;
-
- // Can we blend with zero?
- if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
- /*Low*/ 0) &&
- NumMaskElts <= BlendVT.getVectorNumElements()) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if (Mask[i / MaskRatio] < 0)
- PermuteImm |= 1u << i;
-
- V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
- Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
- return true;
- }
-
- // Attempt to match as a binary blend.
- if (NumMaskElts <= BlendVT.getVectorNumElements()) {
- bool MatchBlend = true;
- for (int i = 0; i != (int)NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- else if (M == SM_SentinelZero)
- MatchBlend = false;
- else if ((M != i) && (M != (i + (int)NumMaskElts)))
- MatchBlend = false;
- }
+ if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+ (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
+ BlendMask)) {
+ if (MaskVT == MVT::v16i16) {
+ // We can only use v16i16 PBLENDW if the lanes are repeated.
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+ RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 &&
+ "Repeated mask size doesn't match!");
+ PermuteImm = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ PermuteImm |= 1 << i;
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ } else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
- if (MatchBlend) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if ((int)NumMaskElts <= Mask[i / MaskRatio])
- PermuteImm |= 1u << i;
+ if (!ShuffleVT.isFloatingPoint()) {
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
return true;
}
}
}
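
The BLENDI path builds an immediate with one bit per destination lane, set when the lane is taken from the second source, and repeats each bit when the blend has to run on narrower integer elements. A minimal sketch of building and scaling such a blend immediate (illustrative only, loosely mirroring what matchVectorShuffleAsBlend and scaleVectorShuffleBlendMask compute):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Bit i of the blend immediate is set when lane i is taken from V2,
    // i.e. when the shuffle mask entry is >= the element count.
    static uint64_t buildBlendMask(const std::vector<int> &Mask) {
      uint64_t Imm = 0;
      unsigned NumElts = Mask.size();
      for (unsigned i = 0; i != NumElts; ++i)
        if (Mask[i] >= int(NumElts))
          Imm |= uint64_t(1) << i;
      return Imm;
    }

    // Repeat each blend bit Scale times, as needed when the blend is
    // performed on Scale-times narrower elements.
    static uint64_t scaleBlendMask(uint64_t Imm, unsigned NumElts, unsigned Scale) {
      uint64_t Scaled = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        if (Imm & (uint64_t(1) << i))
          Scaled |= ((uint64_t(1) << Scale) - 1) << (i * Scale);
      return Scaled;
    }

    int main() {
      std::vector<int> Mask = {0, 5, 2, 7};        // 4 lanes: lanes 1,3 from V2
      uint64_t Imm = buildBlendMask(Mask);         // 0b1010
      uint64_t Wide = scaleBlendMask(Imm, 4, 2);   // 0b11001100 on 8 lanes
      std::printf("imm=%#llx scaled=%#llx\n", (unsigned long long)Imm,
                  (unsigned long long)Wide);
      return 0;
    }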
// Attempt to combine to INSERTPS.
- if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
- SmallBitVector Zeroable(4, false);
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector()) {
+ APInt Zeroable(4, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] < 0)
- Zeroable[i] = true;
+ Zeroable.setBit(i);
- if (Zeroable.any() &&
+ if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -26685,22 +27230,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to SHUFPD.
- if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
- (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
- if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 32 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match each half of the repeated mask, to determine whether it just
+ // references one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
@@ -26732,7 +27281,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
@@ -26764,7 +27313,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
- SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+ SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+ : peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
@@ -26853,6 +27403,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
+ // Which shuffle domains are permitted?
+ // Permit domain crossing at higher combine depths.
+ bool AllowFloatDomain = FloatDomain || (Depth > 3);
+ bool AllowIntDomain = !FloatDomain || (Depth > 3);
+
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
@@ -26869,8 +27424,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT)) {
+ if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26884,8 +27440,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
- Shuffle, ShuffleVT, PermuteImm)) {
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26901,8 +27458,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
- Shuffle, ShuffleVT, UnaryShuffle)) {
+ if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+ UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26918,8 +27476,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
- DAG, Subtarget, Shuffle, ShuffleVT,
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ AllowIntDomain, V1, V2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
@@ -27039,12 +27598,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
- SmallBitVector UndefElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
@@ -27228,8 +27787,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
- SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
- SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ SmallVector<APInt, 16> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
@@ -27245,18 +27804,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
return false;
// Shuffle the constant bits according to the mask.
- SmallBitVector UndefElts(NumMaskElts, false);
- SmallBitVector ZeroElts(NumMaskElts, false);
- SmallBitVector ConstantElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
+ APInt ZeroElts(NumMaskElts, 0);
+ APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
@@ -27266,21 +27825,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
- ConstantElts[i] = true;
+ ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
- assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
@@ -27330,6 +27889,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
+ ArrayRef<const SDNode*> SrcNodes,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -27353,13 +27913,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
- SDValue Input0, Input1;
- SmallVector<int, 16> OpMask;
- if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
return false;
+ assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
+ SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
+ SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
+
// Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+ SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
@@ -27392,8 +27956,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
(RootRatio == 1) != (OpRatio == 1)) &&
"Must not have a ratio for both incoming and op masks!");
- SmallVector<int, 16> Mask;
- Mask.reserve(MaskWidth);
+ SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
@@ -27403,7 +27966,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int RootIdx = i / RootRatio;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
- Mask.push_back(RootMask[RootIdx]);
+ Mask[i] = RootMask[RootIdx];
continue;
}
@@ -27413,7 +27976,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask.push_back(RootMaskedIdx);
+ Mask[i] = RootMaskedIdx;
continue;
}
@@ -27423,7 +27986,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
- Mask.push_back(OpMask[OpIdx]);
+ Mask[i] = OpMask[OpIdx];
continue;
}
@@ -27439,7 +28002,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMaskedIdx += InputIdx1 * MaskWidth;
}
- Mask.push_back(OpMaskedIdx);
+ Mask[i] = OpMaskedIdx;
}
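
The merge loop above composes the root mask with the current op's mask. Stripped of the ratio scaling and multi-operand offsets, the core operation is a plain mask composition, sketched stand-alone below (single input, equal widths, -1 as the undef/zero sentinel):

    #include <cstdio>
    #include <vector>

    // Compose two shuffle masks: the inner op's mask is applied to the input
    // first, the root mask second, so the combined mask is Inner[Root[i]],
    // with negative sentinels (undef/zero) passed straight through.
    static std::vector<int> composeMasks(const std::vector<int> &Root,
                                         const std::vector<int> &Inner) {
      std::vector<int> Out(Root.size(), -1);
      for (unsigned i = 0; i != Root.size(); ++i) {
        int R = Root[i];
        Out[i] = R < 0 ? R : Inner[R];
      }
      return Out;
    }

    int main() {
      std::vector<int> Root = {3, 2, -1, 0};  // applied last
      std::vector<int> Inner = {1, 1, 0, 2};  // applied first to the input
      for (int M : composeMasks(Root, Inner))
        std::printf("%d ", M);                // prints: 2 0 -1 1
      std::printf("\n");
      return 0;
    }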
// Handle the all undef/zero cases early.
@@ -27457,28 +28020,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
// Remove unused shuffle source ops.
- SmallVector<SDValue, 8> UsedOps;
- for (int i = 0, e = Ops.size(); i < e; ++i) {
- int lo = UsedOps.size() * MaskWidth;
- int hi = lo + MaskWidth;
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedOps.push_back(Ops[i]);
- continue;
- }
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
- }
- assert(!UsedOps.empty() && "Shuffle with no inputs detected");
- Ops = UsedOps;
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
- // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ // Update the list of shuffle nodes that have been combined so far.
+ SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+ SrcNodes.end());
+ CombinedNodes.push_back(Op.getNode());
+
+ // See if we can recurse into each shuffle source op (if it's a target
+ // shuffle). The source op should only be combined if it either has a
+ // single use (i.e. current Op) or all its users have already been combined.
for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
- if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
- HasVariableMask, DAG, DCI, Subtarget))
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+ Depth + 1, HasVariableMask, DAG, DCI,
+ Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
@@ -27495,7 +28055,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- SmallVector<int, 16> WidenedMask;
+ SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
@@ -27561,8 +28121,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
@@ -27842,19 +28401,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- bool isFloat = VT.isFloatingPoint();
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+ if (isZero0 && isZero1)
+ return SDValue();
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
+ bool isFloat = VT.isFloatingPoint();
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
@@ -28025,7 +28585,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFD:
- if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
@@ -28173,12 +28733,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
-
- // Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
- return SDValue();
-
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
@@ -28249,11 +28804,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
- Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ Elts.clear();
+ break;
+ }
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
- return LD;
+ if (Elts.size() == VT.getVectorNumElements())
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -28276,7 +28838,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -28303,18 +28865,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EVT OriginalVT = InVec.getValueType();
- if (InVec.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
- EVT BCVT = InVec.getOperand(0).getValueType();
- if (!BCVT.isVector() ||
- BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
- return SDValue();
- InVec = InVec.getOperand(0);
- }
+ // Peek through bitcasts, don't duplicate a load with other uses.
+ InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
+ if (!CurrentVT.isVector() ||
+ CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
@@ -28393,19 +28950,41 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ // Since MMX types are special and don't usually play with other vector types,
+ // it's better to handle them early to be sure we emit efficient code by
+ // avoiding store-load conversions.
- // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
- // special and don't usually play with other vector types, it's better to
- // handle them early to be sure we emit efficient code by avoiding
- // store-load conversions.
+ // Detect bitcasts from i32 to x86mmx low word.
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- N0.getValueType() == MVT::v2i32 &&
- isNullConstant(N0.getOperand(1))) {
+ SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType() == MVT::i32)
return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
}
+ // Detect bitcasts from an element or subvector extraction to x86mmx.
+ if (VT == MVT::x86mmx &&
+ (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
+ N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -28511,12 +29090,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
+ if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
+ // The following instructions assume SelectOp1 is the subtraction operand
+ // and SelectOp2 is the negation operand.
+ // In the case of SETLT this is the other way around.
+ if (CC == ISD::SETLT)
+ std::swap(SelectOp1, SelectOp2);
+
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -28529,8 +29114,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOperand(0) != SelectOp1)
return false;
- // The second operand of the comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+ APInt SplatVal;
+ if ((CC == ISD::SETLT) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+ SplatVal == 1) ||
+ (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+ return false;
+
+ // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+ if ((CC == ISD::SETGT) &&
+ !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
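
detectZextAbsDiff now accepts both comparison directions: a select of Sub and 0-Sub keyed on SETGT against -1/0, or the swapped select keyed on SETLT against 1/0. The scalar shape of the two patterns, written out as plain C++ for illustration:

    #include <cstdint>
    #include <cstdio>

    // Scalar shape of the pattern detectZextAbsDiff matches on vectors:
    //   Sub = A - B; (Sub > -1) ? Sub : 0 - Sub   (SETGT form)
    //   Sub = A - B; (Sub <  1) ? 0 - Sub : Sub   (SETLT form, operands swapped)
    static uint32_t absDiffSetGT(uint32_t A, uint32_t B) {
      int32_t Sub = int32_t(A) - int32_t(B);
      return uint32_t(Sub > -1 ? Sub : 0 - Sub);
    }

    static uint32_t absDiffSetLT(uint32_t A, uint32_t B) {
      int32_t Sub = int32_t(A) - int32_t(B);
      return uint32_t(Sub < 1 ? 0 - Sub : Sub);
    }

    int main() {
      std::printf("%u %u\n", absDiffSetGT(3, 10), absDiffSetLT(3, 10)); // 7 7
      return 0;
    }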
@@ -28576,17 +29170,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+ if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ unsigned BitWidth = ExtractVT.getSizeInBits();
+ if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+ ExtractVT != MVT::i8)
+ return SDValue();
+
+ // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+ SDValue Match = matchBinOpReduction(Extract, Op);
+ if (!Match)
+ continue;
+
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ continue;
+
+ // We require AVX2 for PMOVMSKB with v16i16/v32i8 types.
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 &&
+ ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+ return SDValue();
+
+ // Don't bother performing this for 2-element vectors.
+ if (Match.getValueType().getVectorNumElements() <= 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ APInt CompareBits;
+ ISD::CondCode CondCode;
+ if (Op == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CompareBits = APInt::getNullValue(32);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CondCode = ISD::CondCode::SETEQ;
+ }
+
+ // Perform the select as i32/i64 and then truncate to avoid partial register
+ // stalls.
+ unsigned ResWidth = std::max(BitWidth, 32u);
+ EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+ SDLoc DL(Extract);
+ SDValue Zero = DAG.getConstant(0, DL, ResVT);
+ SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+ SDValue Res = DAG.getBitcast(MaskVT, Match);
+ Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+ Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+ Ones, Zero, CondCode);
+ return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ }
+
+ return SDValue();
+}
+
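combineHorizontalPredicateResult above replaces an OR (any_of) or AND (all_of) reduction of per-lane all-ones/all-zeros results with a single MOVMSK plus one scalar compare. A standalone sketch of the scalar logic it relies on, with a plain bool array standing in for the vector of lane predicates:

    #include <array>
    #include <cstdio>

    // Collapse per-lane boolean results into a movemask-style bitmask, then
    // answer any_of / all_of with one compare, as the MOVMSK combine does.
    static unsigned moveMask(const std::array<bool, 8> &Lanes) {
      unsigned Mask = 0;
      for (unsigned i = 0; i != Lanes.size(); ++i)
        Mask |= unsigned(Lanes[i]) << i;
      return Mask;
    }

    int main() {
      std::array<bool, 8> Lanes = {true, false, true, true,
                                   false, true, true, true};
      unsigned Mask = moveMask(Lanes);
      bool AnyOf = Mask != 0;                        // OR reduction
      bool AllOf = Mask == (1u << Lanes.size()) - 1; // AND reduction
      std::printf("mask=%#x any_of=%d all_of=%d\n", Mask, int(AnyOf), int(AllOf));
      return 0;
    }
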
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is appropriate
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
+ // Verify the type we're extracting from is any integer type above i16.
EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
@@ -28595,15 +29264,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
else if (Subtarget.hasAVX2())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
+ if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+ // The operand is expected to be zero-extended from i8
+ // (verified in detectZextAbsDiff).
+ // To widen the result to i64 and above, an additional any/zero/sign
+ // extend is expected.
+ // A zero extend from 32 bits has no mathematical effect on the result,
+ // and the sign extend behaves like a zero extend here
+ // (it extends a sign bit that is known to be zero),
+ // so it is correct to skip the sign/zero extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -28614,7 +29296,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
- // Create the SAD instruction
+ // Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
@@ -28636,13 +29318,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- // Return the lowest i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ MVT Type = Extract->getSimpleValueType(0);
+ unsigned TypeSizeInBits = Type.getSizeInBits();
+ // Return the lowest TypeSizeInBits bits.
+ MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
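
The extend-skipping logic in combineBasicSADPattern is justified by the fact that a PSADBW-style partial sum is built from zero-extended i8 inputs, so its sign bit is known zero and sign- and zero-extension agree. A small scalar sketch of one SAD lane and of that equivalence (illustration only):

    #include <cstdint>
    #include <cstdio>

    // Sum of absolute differences of 8 bytes, as one PSADBW lane produces it.
    static uint32_t sad8(const uint8_t *A, const uint8_t *B) {
      uint32_t Sum = 0;
      for (int i = 0; i != 8; ++i)
        Sum += A[i] > B[i] ? A[i] - B[i] : B[i] - A[i];
      return Sum; // at most 8 * 255 = 2040, so the sign bit is always zero
    }

    int main() {
      uint8_t A[8] = {1, 200, 3, 40, 5, 60, 7, 80};
      uint8_t B[8] = {9, 100, 30, 4, 50, 6, 70, 8};
      uint32_t S = sad8(A, B);
      // Because the sign bit of S is known zero, sign- and zero-extension to
      // 64 bits give the same value, so the extend node can be looked through.
      uint64_t ZExt = uint64_t(S);
      int64_t SExt = int64_t(int32_t(S));
      std::printf("sad=%u zext=%llu sext=%lld equal=%d\n", S,
                  (unsigned long long)ZExt, (long long)SExt,
                  int(ZExt == uint64_t(SExt)));
      return 0;
    }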
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ // Don't attempt this for boolean mask vectors or unknown extraction indices.
+ if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Resolve the target shuffle inputs and mask.
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
+ return SDValue();
+
+ // Attempt to narrow/widen the shuffle mask to the correct size.
+ if (Mask.size() != NumSrcElts) {
+ if ((NumSrcElts % Mask.size()) == 0) {
+ SmallVector<int, 16> ScaledMask;
+ int Scale = NumSrcElts / Mask.size();
+ scaleShuffleMask(Scale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ } else if ((Mask.size() % NumSrcElts) == 0) {
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > NumSrcElts &&
+ canWidenShuffleElements(Mask, WidenedMask))
+ Mask = std::move(WidenedMask);
+ // TODO - investigate support for wider shuffle masks with known upper
+ // undef/zero elements for implicit zero-extension.
+ }
+ }
+
+ // Check if narrowing/widening failed.
+ if (Mask.size() != NumSrcElts)
+ return SDValue();
+
+ int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ SDLoc dl(N);
+
+ // If the shuffle source element is undef/zero then we can just accept it.
+ if (SrcIdx == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+
+ if (SrcIdx == SM_SentinelZero)
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ SrcIdx = SrcIdx % Mask.size();
+
+ // We can only extract other elements from 128-bit vectors and in certain
+ // circumstances, depending on SSE-level.
+ // TODO: Investigate using extract_subvector for larger vectors.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+ ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ assert(SrcSVT == VT && "Unexpected extraction type");
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ }
+
+ if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
+ "Unexpected extraction type");
+ unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
+ DAG.getValueType(SrcSVT));
+ return DAG.getZExtOrTrunc(Assert, dl, VT);
+ }
+
+ return SDValue();
+}
+
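combineExtractWithShuffle first brings the shuffle mask to the source vector's element count, either by repeating each entry for narrower elements or by merging consecutive entries for wider ones. A simplified sketch of the narrowing direction (no zero/undef sentinels, loosely mirroring what scaleShuffleMask does):

    #include <cstdio>
    #include <vector>

    // Scale a shuffle mask to Scale-times more, narrower elements: each entry
    // M expands to M*Scale, M*Scale+1, ..., M*Scale+Scale-1.
    static std::vector<int> scaleMask(const std::vector<int> &Mask, int Scale) {
      std::vector<int> Out;
      for (int M : Mask)
        for (int i = 0; i != Scale; ++i)
          Out.push_back(M * Scale + i);
      return Out;
    }

    int main() {
      std::vector<int> Mask = {1, 0};               // v2i64-style mask
      std::vector<int> Scaled = scaleMask(Mask, 2); // v4i32-style: 2 3 0 1
      for (int M : Scaled)
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }
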
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -28653,14 +29425,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ return NewOp;
+
SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
+
+ // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getBitcast(VT, InputVector);
+ }
+
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
- N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32 &&
- isa<ConstantSDNode>(N->getOperand(1)) &&
- N->getConstantOperandVal(1) == 0) {
+ VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
@@ -28668,15 +29455,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
- InputVector.getOpcode() == ISD::BITCAST &&
+ if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t ExtractedElt = N->getConstantOperandVal(1);
+ uint64_t InputValue = InputVector.getConstantOperandVal(0);
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -28687,9 +29470,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
+ // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+ return Cmp;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
- if (InputVector.getValueType() != MVT::v4i32)
+ if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
@@ -28717,9 +29504,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Record which element was extracted.
- ExtractedElements |=
- 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
-
+ ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
@@ -28752,11 +29537,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
- EVT ElementType = InputVector.getValueType().getVectorElementType();
+ EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
@@ -28779,8 +29564,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
- SDValue Idx = Extract->getOperand(1);
- uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
@@ -28788,6 +29572,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO - merge with combineExtractVectorElt once it can handle the implicit
+// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
+// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+// combineBasicSADPattern.
+static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
+}
+
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
@@ -28812,12 +29606,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// This situation only applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
- //Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
- DL, CondVT));
- //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getAllOnesConstant(DL, CondVT));
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -28920,18 +29713,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(ShAmt, DL, MVT::i8));
}
- // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
- if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
- return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- }
-
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
@@ -29049,7 +29830,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29063,7 +29844,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
@@ -29072,7 +29853,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29084,7 +29865,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
+ return true;
+ }
+ case X86ISD::SUBV_BROADCAST: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ // Only change element size, not type.
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+ return false;
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = MVT::getVectorVT(EltVT,
+ Op0.getSimpleValueType().getSizeInBits() / EltSize);
+ Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0));
return true;
}
}
@@ -29370,8 +30167,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
- // condition so that the blends can use the high bit of each element and use
- // SimplifyDemandedBits to simplify the condition operand.
+ // condition so that blends can use the high (sign) bit of each element and
+ // use SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
@@ -29406,49 +30203,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
-
+ APInt DemandedMask(APInt::getSignBit(BitWidth));
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
TLO)) {
- // If we changed the computation somewhere in the DAG, this change
- // will affect all users of Cond.
- // Make sure it is fine and update all the nodes so that we do not
- // use the generic VSELECT anymore. Otherwise, we may perform
- // wrong optimizations as we messed up with the actual expectation
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Make sure it is fine and update all the nodes
+ // so that we do not use the generic VSELECT anymore. Otherwise, we may
+ // perform wrong optimizations as we messed with the actual expectation
// for the vector boolean values.
if (Cond != TLO.Old) {
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are
- // set properly.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
+      // Check all uses of the condition operand to see whether it will be
+      // consumed by non-BLEND instructions. Those may require that all bits
+      // are set properly.
+ for (SDNode *U : Cond->uses()) {
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ if (U->getOpcode() != ISD::VSELECT)
return SDValue();
+ }
- // Update all the users of the condition, before committing the change,
- // so that the VSELECT optimizations that expect the correct vector
- // boolean value will not be triggered.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(*I, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
- Cond, I->getOperand(1), I->getOperand(2)));
+ // Update all users of the condition before committing the change, so
+ // that the VSELECT optimizations that expect the correct vector boolean
+ // value will not be triggered.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
+ U->getValueType(0), Cond, U->getOperand(1),
+ U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ }
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
- // At this point, only Cond is changed. Change the condition
- // just for N to keep the opportunity to optimize all other
- // users their own way.
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(N, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
- TLO.New, N->getOperand(1), N->getOperand(2)));
+      // Only Cond (rather than other nodes in the computation chain) was
+      // changed. Change the condition just for N so that all other users
+      // keep the opportunity to be optimized their own way.
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
return SDValue();
}
}
@@ -29456,7 +30249,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Look for vselects with LHS/RHS being bitcasted from an operation that
// can be executed on another type. Push the bitcast to the inputs of
// the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+ if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
CondVT.getVectorElementType() == MVT::i1) {
if (combineBitcastForMaskedOp(LHS, DAG, DCI))
return SDValue(N, 0);
@@ -30208,22 +31001,37 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
}
if (!NewMul) {
- assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
- && "Both cases that could cause potential overflows should have "
- "already been handled.");
- if (isPowerOf2_64(MulAmt - 1))
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt - 1), DL,
- MVT::i8)));
-
- else if (isPowerOf2_64(MulAmt + 1))
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
- N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt + 1),
- DL, MVT::i8)), N->getOperand(0));
+ assert(MulAmt != 0 &&
+ MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ int64_t SignMulAmt = C->getSExtValue();
+ if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
+ (SignMulAmt != -INT64_MAX)) {
+ int NumSign = SignMulAmt > 0 ? 1 : -1;
+ bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
+ bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
+ if (IsPowerOf2_64PlusOne) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
+ MVT::i8)));
+ } else if (IsPowerOf2_64MinusOne) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
+ MVT::i8)),
+ N->getOperand(0));
+ }
+      // To negate, subtract the result from zero.
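+      // e.g. (mul x, -(2^N + 1)) => (sub 0, (add (shl x, N), x))
+      //      (mul x, -(2^N - 1)) => (sub 0, (sub (shl x, N), x))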
+ if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ }
}
if (NewMul)
@@ -30396,42 +31204,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
- "Unexpected opcode");
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+ X86ISD::VSRLI == Opcode) &&
+ "Unexpected shift opcode");
+ bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
- // This fails for mask register (vXi1) shifts.
- if ((NumBitsPerElt % 8) != 0)
- return SDValue();
+ assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+ "Unexpected value type");
// Out of range logical bit shifts are guaranteed to be zero.
- APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
- if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // Out of range arithmetic bit shifts splat the sign bit.
+ APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
+ if (LogicalShift)
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ else
+ ShiftVal = NumBitsPerElt - 1;
+ }
// Shift N0 by zero -> N0.
if (!ShiftVal)
- return N->getOperand(0);
+ return N0;
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
+ // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
+ // TODO - support other sra opcodes as needed.
+ if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
+ N0.getOpcode() == X86ISD::VSRAI)
+ return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+
// We can decode 'whole byte' logical bit shifts as shuffles.
- if ((ShiftVal.getZExtValue() % 8) == 0) {
+ if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
+ // Constant Folding.
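+  // If the shifted source resolves to per-element constant bits (and this
+  // shift is its only user), evaluate the shift on each element now and
+  // return the folded constant vector.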
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (N->isOnlyUserOf(N0.getNode()) &&
+ getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
+ assert(EltBits.size() == VT.getVectorNumElements() &&
+ "Unexpected shift value type");
+ unsigned ShiftImm = ShiftVal.getZExtValue();
+ for (APInt &Elt : EltBits) {
+ if (X86ISD::VSHLI == Opcode)
+ Elt = Elt.shl(ShiftImm);
+ else if (X86ISD::VSRAI == Opcode)
+ Elt = Elt.ashr(ShiftImm);
+ else
+ Elt = Elt.lshr(ShiftImm);
+ }
+ return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ return SDValue();
+}
+
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(
+ ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW &&
+ N->getValueType(0) == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget);
return SDValue();
}
@@ -30550,33 +31411,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- // Canonicalize XOR to the left.
- if (N1.getOpcode() == ISD::XOR)
- std::swap(N0, N1);
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
- if (N0.getOpcode() != ISD::XOR)
- return SDValue();
-
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
- N01 = peekThroughBitcasts(N01);
-
- // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
- // insert_subvector building a 256-bit AllOnes vector.
- if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
- if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
- return SDValue();
-
- SDValue V1 = N01->getOperand(0);
- SDValue V2 = N01->getOperand(1);
- if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
- !V1.getOperand(0).isUndef() ||
- !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
- !ISD::isBuildVectorAllOnes(V2.getNode()))
- return SDValue();
- }
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
+ return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -30696,38 +31539,34 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
-/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
-/// eliminate loading the vector constant mask value. This relies on the fact
-/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
+/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
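+/// For example, with v4i32 elements:
+///   and (pcmpgt x, y), 1 --> psrld (pcmpgt x, y), 31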
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
- // TODO: Use AssertSext to mark any nodes that have the property of producing
- // all-ones or all-zeros. Then check for that node rather than particular
- // opcodes.
- if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
- // The existence of the PCMP node guarantees that we have the required SSE2 or
- // AVX2 for a shift of this vector type, but there is no vector shift by
- // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
- // masked compare nodes, so they should not make it here.
- EVT VT0 = Op0.getValueType();
- EVT VT1 = Op1.getValueType();
- unsigned EltBitWidth = VT0.getScalarSizeInBits();
- if (VT0 != VT1 || EltBitWidth == 8)
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
+ !SplatVal.isMask())
return SDValue();
- assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+ if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ return SDValue();
- APInt SplatVal;
- if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
- SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ unsigned ShiftVal = SplatVal.countTrailingOnes();
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
@@ -30747,7 +31586,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
- if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
EVT VT = N->getValueType(0);
@@ -30760,7 +31599,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -30969,7 +31808,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
- N->getOperand(1).getConstantOperandVal(1) == 0 &&
+ isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
@@ -31272,6 +32111,74 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
+/// Check if truncation with saturation from type \p SrcVT to \p DstVT
+/// is valid for the given \p Subtarget.
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX512())
+ return false;
+
+  // FIXME: A scalar type may be supported if we move it to a vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
+ EVT SrcElVT = SrcVT.getScalarType();
+ EVT DstElVT = DstVT.getScalarType();
+ if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
+ return false;
+ if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ return false;
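+  // Sub-512-bit sources additionally require VLX, and 16-bit source elements
+  // require BWI (e.g. VPMOVUSWB for i16 -> i8).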
+ if (SrcVT.is512BitVector() || Subtarget.hasVLX())
+ return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
+ return false;
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
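+/// For example, (v8i16 (truncate (umin (v8i32 x), 65535))) matches and
+/// returns x.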
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
+ if (In.getOpcode() != ISD::UMIN)
+ return SDValue();
+
+  // Saturation with truncation. We truncate from InVT to VT.
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ APInt C;
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
+    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+    // the element size of the destination type.
+ return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow using the VPMOVUS* instructions on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectUSatPattern(In, VT);
+}
+
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
+ return SDValue();
+ if (auto USatVal = detectUSatPattern(In, VT))
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ return SDValue();
+}
+
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
@@ -31664,7 +32571,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
@@ -31838,6 +32745,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ if (SDValue Val =
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
@@ -32198,13 +33111,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
- auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
- // TODO: Add extra cases where we can truncate both inputs for the
- // cost of one (or none).
- // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+ unsigned TruncSizeInBits = VT.getScalarSizeInBits();
+
+ // Repeated operand, so we are only trading one output truncation for
+ // one input truncation.
if (Op0 == Op1)
return true;
+ // See if either operand has been extended from a smaller/equal size to
+ // the truncation size, allowing a truncation to combine with the extend.
+ unsigned Opcode0 = Op0.getOpcode();
+ if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
+ Opcode0 == ISD::ZERO_EXTEND) &&
+ Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ unsigned Opcode1 = Op1.getOpcode();
+ if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
+ Opcode1 == ISD::ZERO_EXTEND) &&
+ Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ // See if either operand is a single use constant which can be constant
+ // folded.
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
@@ -32236,7 +33166,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32252,7 +33182,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32458,6 +33388,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -32804,6 +33738,34 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
}
+/// Do target-specific dag combines on X86ISD::ANDNP nodes.
+static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // ANDNP(0, x) -> x
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return N->getOperand(1);
+
+ // ANDNP(x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+
+ EVT VT = N->getValueType(0);
+
+ // Attempt to recursively combine a bitmask ANDNP with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
+}
+
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
@@ -33065,13 +34027,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
+ if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
+    // Inverting and sign-extending a boolean is the same as zero-extending it
+    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract
+    // is efficiently lowered with an LEA or a DEC.
+    // This is the same as: select Bool, 0, -1.
+    // sext (xor Bool, -1) --> sub (zext Bool), 1
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
+ }
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -33212,8 +34183,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Optimize x == -y --> x+y == 0
-/// x != -y --> x+y != 0
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+ // We're looking for an oversized integer equality comparison, but ignore a
+ // comparison with zero because that gets special treatment in EmitTest().
+ SDValue X = SetCC->getOperand(0);
+ SDValue Y = SetCC->getOperand(1);
+ EVT OpVT = X.getValueType();
+ unsigned OpSize = OpVT.getSizeInBits();
+ if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+ return SDValue();
+
+ // TODO: Use PXOR + PTEST for SSE4.1 or later?
+ // TODO: Add support for AVX-512.
+ EVT VT = SetCC->getValueType(0);
+ SDLoc DL(SetCC);
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX2())) {
+ EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+ // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+ // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+ // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+ // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
+ // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+ SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+ SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
+ MVT::i32);
+ return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+ }
+
+ return SDValue();
+}
+
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -33222,21 +34232,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc DL(N);
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
- if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
- LHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+ EVT OpVT = LHS.getValueType();
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+ LHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
- if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
- RHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+ RHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+ return V;
+ }
+
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
@@ -33293,56 +34309,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of performSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
-// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
- SelectionDAG &DAG, MVT VT) {
- if (VT == MVT::i8)
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS),
- DAG.getConstant(1, DL, VT));
- assert (VT == MVT::i1 && "Unexpected type for SECCC node");
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS));
-}
-
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
- if (CC == X86::COND_A) {
- // Try to convert COND_A into COND_B in an attempt to facilitate
- // materializing "setb reg".
- //
- // Do not flip "e > c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
- EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
- }
- }
-
- // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
- // a zext and produces an all-ones bit which is more useful than 0/1 in some
- // cases.
- if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
-
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
return getSETCC(CC, Flags, DL, DAG);
@@ -33352,7 +34325,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
@@ -33538,45 +34510,159 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// fold (add Y, (sete X, 0)) -> adc 0, Y
-/// (add Y, (setne X, 0)) -> sbb -1, Y
-/// (sub (sete X, 0), Y) -> sbb 0, Y
-/// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
+/// which is more useful than 0/1 in some cases.
+static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
SDLoc DL(N);
+ // "Condition code B" is also known as "the carry flag" (CF).
+ SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+ SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
+ MVT VT = N->getSimpleValueType(0);
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
- // Look through ZExts.
- SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
- if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
- return SDValue();
+ assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
+}
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+
+ // If this is an add, canonicalize a zext operand to the RHS.
+ // TODO: Incomplete? What if both sides are zexts?
+ if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+ Y.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(X, Y);
+
+ // Look through a one-use zext.
+ bool PeekedThroughZext = false;
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+ Y = Y.getOperand(0);
+ PeekedThroughZext = true;
+ }
- SDValue SetCC = Ext.getOperand(0);
- if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ // If this is an add, canonicalize a setcc operand to the RHS.
+ // TODO: Incomplete? What if both sides are setcc?
+ // TODO: Should we allow peeking through a zext of the other operand?
+ if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+ Y.getOpcode() != X86ISD::SETCC)
+ std::swap(X, Y);
+
+ if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
- X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> X + (mask SBB Z, Z)
+ // X - SETB Z --> X - (mask SBB Z, Z)
+ // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
+ SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+
+ if (CC == X86::COND_A) {
+ SDValue EFLAGS = Y->getOperand(1);
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
- SDValue Cmp = SetCC.getOperand(1);
+ SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- SDValue CmpOp0 = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
- DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue Z = Cmp.getOperand(0);
+ SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
+ DAG.getConstant(1, DL, Z.getValueType()));
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
- SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
- NewCmp);
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), NewCmp);
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), NewCmp);
+}
+
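+/// Attempt to turn a vector-reduction add of a widened multiply into
+/// VPMADDWD. A rough sketch of the transform (acc is the reduction
+/// accumulator):
+///   add (mul a, b), acc
+///   --> add (concat (vpmaddwd (trunc a), (trunc b)), 0), acc
+/// canReduceVMulWidth checks that the multiply operands really fit in
+/// 16 bits before the operands are narrowed.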
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128, or greater than the supported RegSize,
+ // do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+  // The VPMADDWD result is half the width of the original vector.
+  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+  // Fill the rest of the output with zeroes.
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -33656,6 +34742,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (Flags->hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
@@ -33667,7 +34755,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
@@ -33700,36 +34788,44 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
SDLoc DL(N);
unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
- unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+ unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
+ unsigned InputBits = OpEltSizeInBits * NumElts;
// Perform any constant folding.
// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- unsigned NumDstElts = VT.getVectorNumElements();
- SmallBitVector Undefs(NumDstElts, false);
- SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
- for (unsigned i = 0; i != NumDstElts; ++i) {
- SDValue OpElt = Op.getOperand(i);
- if (OpElt.getOpcode() == ISD::UNDEF) {
- Undefs[i] = true;
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
+ APInt Undefs(NumElts, 0);
+ SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
+ bool IsZEXT =
+ (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ Undefs.setBit(i);
continue;
}
- APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
- Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
- : Cst.sextOrTrunc(SVT.getSizeInBits());
+ Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
+ : EltBits[i].sextOrTrunc(EltSizeInBits);
}
return getConstVector(Vals, Undefs, VT, DAG, DL);
}
@@ -33829,7 +34925,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
if (N->getOpcode() == X86ISD::PCMPGT)
return getZeroVector(VT, Subtarget, DAG, DL);
}
@@ -33837,6 +34933,98 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = N->getSimpleValueType(0);
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // If this is an insert of an extract, combine to a shuffle. Don't do this
+ // if the insert or extract can be represented with a subvector operation.
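+  // For example, with v8f32: insert_subvector A, (extract_subvector B, 4), 0
+  // --> shuffle A, B, <12,13,14,15,4,5,6,7>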
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+ (IdxVal != 0 || !Vec.isUndef())) {
+ int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+ if (ExtIdxVal != 0) {
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now insert the extracted portion.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+ return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ }
+ }
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
+ // --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = {SubVec2, SubVec};
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ return Ld;
+ }
+ }
+ // If lower/upper loads are the same and the only users of the load, then
+ // lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+      // If this is a subv_broadcast inserted into both halves, use a larger
+      // subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -33845,6 +35033,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case X86ISD::PEXTRW:
+ case X86ISD::PEXTRB:
+ return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_SUBVECTOR:
+ return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
@@ -33870,6 +35063,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -33884,12 +35078,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
- case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::VSHLI:
- case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VSEXT:
case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
@@ -34717,10 +35917,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return Res;
}
- // 'A' means EAX + EDX.
+ // 'A' means [ER]AX + [ER]DX.
if (Constraint == "A") {
- Res.first = X86::EAX;
- Res.second = &X86::GR32_ADRegClass;
+ if (Subtarget.is64Bit()) {
+ Res.first = X86::RAX;
+ Res.second = &X86::GR64_ADRegClass;
+ } else if (Subtarget.is32Bit()) {
+ Res.first = X86::EAX;
+ Res.second = &X86::GR32_ADRegClass;
+ } else if (Subtarget.is16Bit()) {
+ Res.first = X86::AX;
+ Res.second = &X86::GR16_ADRegClass;
+ } else {
+ llvm_unreachable("Expecting 64, 32 or 16 bit subtarget");
+ }
return Res;
}
return Res;
@@ -34812,7 +36022,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
@@ -34820,8 +36030,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ bool OptSize =
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}