Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 4020
1 file changed, 2615 insertions, 1405 deletions
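The headline tuning knob added by this change is the new -x86-experimental-pref-loop-alignment option: its value N is an exponent, so loop headers are aligned to 2^N bytes and the low N bits of the loop header PC become zero (default N = 4, i.e. the existing 16-byte alignment). A minimal standalone sketch of that relationship, with illustrative names that are not LLVM API:

// Standalone model (not LLVM code) of what the alignment exponent controls.
#include <cstdint>
#include <iostream>

// Round an address up to the next 2^AlignExp boundary, as block alignment
// effectively does for loop headers.
static uint64_t alignLoopHeader(uint64_t Addr, unsigned AlignExp) {
  uint64_t Mask = (uint64_t(1) << AlignExp) - 1;
  return (Addr + Mask) & ~Mask;
}

int main() {
  const unsigned AlignExp = 4; // default: 2^4 = 16 bytes
  for (uint64_t Addr : {0x1001ull, 0x1010ull, 0x103full}) {
    uint64_t Aligned = alignLoopHeader(Addr, AlignExp);
    std::cout << std::hex << Addr << " -> " << Aligned << std::dec
              << " (low " << AlignExp << " bits are zero)\n";
  }
}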
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 08fe2bad281e..7ff483063ec2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -53,6 +53,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include <algorithm> #include <bitset> @@ -70,6 +71,13 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); +static cl::opt<int> ExperimentalPrefLoopAlignment( + "x86-experimental-pref-loop-alignment", cl::init(4), + cl::desc("Sets the preferable loop alignment for experiments " + "(the last x86-experimental-pref-loop-alignment bits" + " of the loop header PC will be 0)."), + cl::Hidden); + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -427,7 +435,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } - // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + + // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; @@ -782,6 +791,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); @@ -888,6 +898,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); @@ -922,6 +935,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); @@ -1065,6 +1086,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? 
Legal : Custom); @@ -1126,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } @@ -1271,6 +1293,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } if (Subtarget.hasVLX()) { + setOperationAction(ISD::ABS, MVT::v4i64, Legal); + setOperationAction(ISD::ABS, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); @@ -1357,16 +1381,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v16i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i64, Legal); - setOperationAction(ISD::ADD, MVT::v8i1, Expand); - setOperationAction(ISD::ADD, MVT::v16i1, Expand); - setOperationAction(ISD::SUB, MVT::v8i1, Expand); - setOperationAction(ISD::SUB, MVT::v16i1, Expand); - setOperationAction(ISD::MUL, MVT::v8i1, Expand); - setOperationAction(ISD::MUL, MVT::v16i1, Expand); + setOperationAction(ISD::ADD, MVT::v8i1, Custom); + setOperationAction(ISD::ADD, MVT::v16i1, Custom); + setOperationAction(ISD::SUB, MVT::v8i1, Custom); + setOperationAction(ISD::SUB, MVT::v16i1, Custom); + setOperationAction(ISD::MUL, MVT::v8i1, Custom); + setOperationAction(ISD::MUL, MVT::v16i1, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); @@ -1441,7 +1466,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Legal); @@ -1460,12 +1485,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::ADD, MVT::v32i1, Expand); - setOperationAction(ISD::ADD, MVT::v64i1, Expand); - setOperationAction(ISD::SUB, MVT::v32i1, Expand); - setOperationAction(ISD::SUB, MVT::v64i1, Expand); - setOperationAction(ISD::MUL, MVT::v32i1, Expand); - setOperationAction(ISD::MUL, MVT::v64i1, Expand); + setOperationAction(ISD::ADD, MVT::v32i1, Custom); + setOperationAction(ISD::ADD, MVT::v64i1, Custom); + setOperationAction(ISD::SUB, MVT::v32i1, Custom); + setOperationAction(ISD::SUB, MVT::v64i1, Custom); + setOperationAction(ISD::MUL, MVT::v32i1, Custom); + setOperationAction(ISD::MUL, MVT::v64i1, Custom); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); @@ -1479,8 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); @@ -1546,6 +1571,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); @@ -1574,9 +1600,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v2i1, &X86::VK2RegClass); for (auto VT : { MVT::v2i1, MVT::v4i1 }) { - setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); @@ -1671,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); @@ -1696,6 +1723,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -1712,7 +1741,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - setPrefLoopAlignment(4); // 2^4 bytes. + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). + setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. @@ -1933,6 +1963,34 @@ bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } +void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const { + + // Only relabel X86-32 for C / Stdcall CCs. 
+ if (Subtarget.is64Bit()) + return; + if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) + return; + unsigned ParamRegs = 0; + if (auto *M = MF->getFunction()->getParent()) + ParamRegs = M->getNumberRegisterParameters(); + + // Mark the first N int arguments as having reg + for (unsigned Idx = 0; Idx < Args.size(); Idx++) { + Type *T = Args[Idx].Ty; + if (T->isPointerTy() || T->isIntegerTy()) + if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { + unsigned numRegs = 1; + if (MF->getDataLayout().getTypeAllocSize(T) > 4) + numRegs = 2; + if (ParamRegs < numRegs) + return; + ParamRegs -= numRegs; + Args[Idx].IsInReg = true; + } + } +} + const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, @@ -2001,21 +2059,37 @@ unsigned X86TargetLowering::getAddressSpace() const { return 256; } -Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - // glibc has a special slot for the stack guard in tcbhead_t, use it instead - // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h) - if (!Subtarget.isTargetGlibc()) - return TargetLowering::getIRStackGuard(IRB); - - // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x14 on i386 - unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; - unsigned AddressSpace = getAddressSpace(); +static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { + return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || + (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); +} + +static Constant* SegmentOffset(IRBuilder<> &IRB, + unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } +Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // glibc, bionic, and Fuchsia have a special slot for the stack guard in + // tcbhead_t; use it instead of the usual global variable (see + // sysdeps/{i386,x86_64}/nptl/tls.h) + if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + return SegmentOffset(IRB, 0x10, getAddressSpace()); + } else { + // %fs:0x28, unless we're using a Kernel code model, in which case + // it's %gs:0x28. gs:0x14 on i386. + unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } + } + + return TargetLowering::getIRStackGuard(IRB); +} + void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isOSMSVCRT()) { @@ -2027,13 +2101,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { auto *SecurityCheckCookie = cast<Function>( M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()), nullptr)); + Type::getInt8PtrTy(M.getContext()))); SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); return; } - // glibc has a special slot for the stack guard. - if (Subtarget.isTargetGlibc()) + // glibc, bionic, and Fuchsia have a special slot for the stack guard. 
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } @@ -2056,21 +2130,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); - if (!Subtarget.isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); - // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - unsigned AddressSpace, Offset; + if (Subtarget.isTargetAndroid()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x24 on i386 + unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } - // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x24 on i386 - Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; - AddressSpace = getAddressSpace(); - return ConstantExpr::getIntToPtr( - ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); + // Fuchsia is similar. + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + return SegmentOffset(IRB, 0x18, getAddressSpace()); + } + + return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -2179,6 +2255,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); + + // Add the register to the CalleeSaveDisableRegs list. + if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); + SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); @@ -2253,6 +2334,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); + + // Add the second register to the CalleeSaveDisableRegs list. + if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } @@ -2309,6 +2394,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + + // Add the returned register to the CalleeSaveDisableRegs list. 
+ if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -2444,7 +2533,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, // Convert the i32 type into v32i1 type Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); - // Concantenate the two values together + // Concatenate the two values together return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } @@ -2488,8 +2577,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, + uint32_t *RegMask) const { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget.is64Bit(); @@ -2503,6 +2594,14 @@ SDValue X86TargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); + // In some calling conventions we need to remove the used registers + // from the register mask. + if (RegMask && CallConv == CallingConv::X86_RegCall) { + for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + } + // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { @@ -2669,6 +2768,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; + MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same @@ -2686,13 +2786,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { - const X86Subtarget& Subtarget = - static_cast<const X86Subtarget&>(DAG.getSubtarget()); // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + if (Subtarget.is64Bit() && Ins.size() == 2) { + // The stack pointer needs to be realigned for 64 bit handlers with error + // code, so the argument offset changes by 8 bytes. + Offset += 8; + } } // FIXME: For now, all byval parameter objects are marked mutable. This can be @@ -2707,30 +2810,71 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } - return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - } else { - int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable); - - // Set SExt or ZExt flag. 
- if (VA.getLocInfo() == CCValAssign::ZExt) { - MFI.setObjectZExt(FI, true); - } else if (VA.getLocInfo() == CCValAssign::SExt) { - MFI.setObjectSExt(FI, true); + return DAG.getFrameIndex(FI, PtrVT); + } + + // This is an argument in memory. We might be able to perform copy elision. + if (Flags.isCopyElisionCandidate()) { + EVT ArgVT = Ins[i].ArgVT; + SDValue PartAddr; + if (Ins[i].PartOffset == 0) { + // If this is a one-part value or the first part of a multi-part value, + // create a stack object for the entire argument value type and return a + // load from our portion of it. This assumes that if the first part of an + // argument is in memory, the rest will also be in memory. + int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), + /*Immutable=*/false); + PartAddr = DAG.getFrameIndex(FI, PtrVT); + return DAG.getLoad( + ValVT, dl, Chain, PartAddr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + } else { + // This is not the first piece of an argument in memory. See if there is + // already a fixed stack object including this offset. If so, assume it + // was created by the PartOffset == 0 branch above and create a load from + // the appropriate offset into it. + int64_t PartBegin = VA.getLocMemOffset(); + int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; + int FI = MFI.getObjectIndexBegin(); + for (; MFI.isFixedObjectIndex(FI); ++FI) { + int64_t ObjBegin = MFI.getObjectOffset(FI); + int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); + if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) + break; + } + if (MFI.isFixedObjectIndex(FI)) { + SDValue Addr = + DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), + DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); + return DAG.getLoad( + ValVT, dl, Chain, Addr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, + Ins[i].PartOffset)); + } } + } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } + int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, + VA.getLocMemOffset(), isImmutable); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad( - ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - return ExtendedInMem ? - DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; + // Set SExt or ZExt flag. + if (VA.getLocInfo() == CCValAssign::ZExt) { + MFI.setObjectZExt(FI, true); + } else if (VA.getLocInfo() == CCValAssign::SExt) { + MFI.setObjectSExt(FI, true); + } + + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI.setObjectOffset(FI, Offset); } + + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) + : Val; } // FIXME: Get this from tablegen. 
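The copy-elision path added to LowerMemArgument above reuses an existing fixed stack object whenever a later piece of a split argument falls entirely inside one that was created for the first piece; only if no containing object is found does it fall back to creating a fresh fixed object. A minimal standalone sketch of that containment scan, with illustrative types rather than the MachineFrameInfo API:

#include <cstdint>
#include <iostream>
#include <vector>

struct FixedObject {
  int64_t Offset; // byte offset of the fixed stack object
  int64_t Size;   // size in bytes
};

// Return the index of a fixed object whose range contains [PartBegin, PartEnd),
// or -1 if none exists (in which case a fresh object would be created).
static int findContainingObject(const std::vector<FixedObject> &Objects,
                                int64_t PartBegin, int64_t PartEnd) {
  for (int FI = 0, E = (int)Objects.size(); FI != E; ++FI) {
    int64_t ObjBegin = Objects[FI].Offset;
    int64_t ObjEnd = ObjBegin + Objects[FI].Size;
    if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
      return FI;
  }
  return -1;
}

int main() {
  // One 16-byte object created for the first part of an argument at offset 8.
  std::vector<FixedObject> Objects = {{8, 16}};
  // A later 8-byte part at offset 16 lies inside it, so the object is reused
  // and the load address becomes FrameIndex + PartOffset.
  std::cout << findContainingObject(Objects, 16, 24) << "\n"; // prints 0
  std::cout << findContainingObject(Objects, 32, 40) << "\n"; // prints -1
}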
@@ -2781,12 +2925,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } +#ifndef NDEBUG static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } +#endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2836,8 +2982,8 @@ SDValue X86TargetLowering::LowerFormalArguments( // The next loop assumes that the locations are in the same order of the // input arguments. - if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; @@ -2853,7 +2999,7 @@ SDValue X86TargetLowering::LowerFormalArguments( "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are - // compiled to 32 bit arch, are splited up into two registers. + // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { @@ -3107,8 +3253,9 @@ SDValue X86TargetLowering::LowerFormalArguments( MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { - // X86 interrupts must pop the error code if present - FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); + // X86 interrupts must pop the error code (and the alignment padding) if + // present. + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. @@ -3146,6 +3293,12 @@ SDValue X86TargetLowering::LowerFormalArguments( } } + if (CallConv == CallingConv::X86_RegCall) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end())) + MF.getRegInfo().disableCalleeSavedRegister(Pair.first); + } + return Chain; } @@ -3348,8 +3501,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The next loop assumes that the locations are in the same order of the // input arguments. - if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. 
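Several of the X86_RegCall changes in this patch (in LowerCallResult above and LowerCall below) edit the call's register mask directly: the mask packs one "preserved across the call" bit per physical register, 32 registers per uint32_t word, and the patch clears the bits of registers (and their sub-registers) that carry arguments or return values. A standalone sketch of that bit arithmetic, with illustrative names:

#include <cstdint>
#include <cstdio>
#include <vector>

// Clear the "preserved across the call" bit for register number Reg.
static void clearPreservedBit(std::vector<uint32_t> &RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}

static bool isPreserved(const std::vector<uint32_t> &RegMask, unsigned Reg) {
  return RegMask[Reg / 32] & (1u << (Reg % 32));
}

int main() {
  const unsigned NumRegs = 96; // illustrative register count
  std::vector<uint32_t> RegMask((NumRegs + 31) / 32, ~0u); // start all-preserved
  clearPreservedBit(RegMask, 37); // e.g. a register used to pass an argument
  std::printf("reg 37 preserved: %d, reg 38 preserved: %d\n",
              isPreserved(RegMask, 37), isPreserved(RegMask, 38)); // 0, 1
}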
@@ -3517,7 +3670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && - "Expecting custome case only in regcall calling convention"); + "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; @@ -3662,7 +3815,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Mask = RegInfo->getNoPreservedMask(); } - Ops.push_back(DAG.getRegisterMask(Mask)); + // Define a new register mask from the existing mask. + uint32_t *RegMask = nullptr; + + // In some calling conventions we need to remove the used physical registers + // from the reg mask. + if (CallConv == CallingConv::X86_RegCall) { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + + // Allocate a new Reg Mask and copy Mask. + RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); + + // Make sure all sub registers of the argument registers are reset + // in the RegMask. + for (auto const &RegPair : RegsToPass) + for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + + // Create the RegMask Operand according to our updated mask. + Ops.push_back(DAG.getRegisterMask(RegMask)); + } else { + // Create the RegMask Operand according to the static mask. + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -3715,8 +3893,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, - Ins, dl, DAG, InVals); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, RegMask); } //===----------------------------------------------------------------------===// @@ -4132,6 +4310,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { return true; // 'Faux' Target Shuffles. case ISD::AND: + case X86ISD::ANDNP: return true; } } @@ -4448,6 +4627,11 @@ bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } +bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + return true; +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -4460,6 +4644,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { return true; } +MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { + MVT VT = MVT::getIntegerVT(NumBits); + if (isTypeLegal(VT)) + return VT; + + // PMOVMSKB can handle this. + if (NumBits == 128 && isTypeLegal(MVT::v16i8)) + return MVT::v16i8; + + // VPMOVMSKB can handle this. + if (NumBits == 256 && isTypeLegal(MVT::v32i8)) + return MVT::v32i8; + + // TODO: Allow 64-bit type for 32-bit target. + // TODO: 512-bit types should be allowed, but make sure that those + // cases are handled in combineVectorSizedSetCCEquality(). + + return MVT::INVALID_SIMPLE_VALUE_TYPE; +} + /// Val is the undef sentinel value or equal to the specified value. 
static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); @@ -4555,28 +4759,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, SmallVectorImpl<int> &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + int M0 = Mask[i]; + int M1 = Mask[i + 1]; + // If both elements are undef, its trivial. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && - Mask[i + 1] % 2 == 1) { - WidenedMask[i / 2] = Mask[i + 1] / 2; + if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { + WidenedMask[i / 2] = M1 / 2; continue; } - if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { + WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. - if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { + if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && + (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } @@ -4585,9 +4791,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, // Finally check if the two mask values are adjacent and aligned with // a pair. - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && - Mask[i] + 1 == Mask[i + 1]) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { + WidenedMask[i / 2] = M0 / 2; continue; } @@ -4770,9 +4975,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, return ConstsNode; } -static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs, +static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { - assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays"); + assert(Bits.size() == Undefs.getBitWidth() && + "Unequal constant and undef arrays"); SmallVector<SDValue, 32> Ops; bool Split = false; @@ -4844,10 +5050,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); - // Extract from UNDEF is UNDEF. - if (Vec.isUndef()) - return DAG.getUNDEF(ResultVT); - // Extract the relevant vectorWidth bits. 
Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); @@ -4918,50 +5120,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - - // For insertion into the zero index (low half) of a 256-bit vector, it is - // more efficient to generate a blend with immediate instead of an insert*128. - // We are still creating an INSERT_SUBVECTOR below with an undef node to - // extend the subvector to the size of the result vector. Make sure that - // we are not recursing on that node by checking for undef here. - if (IdxVal == 0 && Result.getValueType().is256BitVector() && - !Result.isUndef()) { - EVT ResultVT = Result.getValueType(); - SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResultVT); - SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, - Vec, ZeroIndex); - - // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); - if (ScalarType.isFloatingPoint()) { - // Choose either vblendps (float) or vblendpd (double). - unsigned ScalarSize = ScalarType.getSizeInBits(); - assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); - unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; - SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); - return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); - } - - const X86Subtarget &Subtarget = - static_cast<const X86Subtarget &>(DAG.getSubtarget()); - - // AVX2 is needed for 256-bit integer blend support. - // Integers must be cast to 32-bit because there is only vpblendd; - // vpblendw can't be used for this because it has a handicapped mask. - - // If we don't have AVX2, then cast to float. Using a wrong domain blend - // is still more efficient than using the wrong domain vinsertf128 that - // will be created by InsertSubVector(). - MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; - - SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); - Result = DAG.getBitcast(CastVT, Result); - Vec256 = DAG.getBitcast(CastVT, Vec256); - Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); - return DAG.getBitcast(ResultVT, Vec256); - } - return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } @@ -5023,7 +5181,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { if (IdxVal != 0) { SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + ShiftBits); } return ExtractSubVec(WideSubVec); } @@ -5032,9 +5191,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Vec = ShiftRight ? 
DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; return ExtractSubVec(Vec); } @@ -5043,8 +5202,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), @@ -5056,12 +5215,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); return ExtractSubVec(Vec); } @@ -5094,26 +5253,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, } /// Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with -/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, - SelectionDAG &DAG, const SDLoc &dl) { +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; - SDValue Vec; - if (!Subtarget.hasInt256() && NumElts == 8) { - Vec = DAG.getConstant(Ones, dl, MVT::v4i32); - Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); - } else { - Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); - } + SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } +static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, + SelectionDAG &DAG) { + EVT InVT = In.getValueType(); + assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); + + if (VT.is128BitVector() && InVT.is128BitVector()) + return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) + : DAG.getZeroExtendVectorInReg(In, DL, VT); + + // For 256-bit vectors, we only need the lower (128-bit) input half. + // For 512-bit vectors, we only need the lower input half or quarter. 
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { + int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); + In = extractSubVector(In, 0, DAG, DL, + std::max(128, (int)VT.getSizeInBits() / Scale)); + } + + return DAG.getNode(Opc, DL, VT, In); +} + /// Generate unpacklo/unpackhi shuffle mask. static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, bool Unary) { @@ -5199,9 +5370,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, - SmallBitVector &UndefElts, - SmallVectorImpl<APInt> &EltBits) { - assert(UndefElts.empty() && "Expected an empty UndefElts vector"); + APInt &UndefElts, + SmallVectorImpl<APInt> &EltBits, + bool AllowWholeUndefs = true, + bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); @@ -5211,56 +5383,83 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; + unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); // Split the undef/constant single bitset data into the target elements. auto SplitBitData = [&]() { - UndefElts = SmallBitVector(NumElts, false); + // Don't split if we don't allow undef bits. + bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; + if (UndefBits.getBoolValue() && !AllowUndefs) + return false; + + UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { - APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits); - UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits); + unsigned BitOffset = i * EltSizeInBits; + APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); - // Only treat an element as UNDEF if all bits are UNDEF, otherwise - // treat it as zero. + // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { - UndefElts[i] = true; + if (!AllowWholeUndefs) + return false; + UndefElts.setBit(i); continue; } - APInt Bits = MaskBits.lshr(i * EltSizeInBits); - Bits = Bits.zextOrTrunc(EltSizeInBits); + // If only some bits are UNDEF then treat them as zero (or bail if not + // supported). + if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) + return false; + + APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); EltBits[i] = Bits.getZExtValue(); } return true; }; - auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask, - APInt &Undefs) { + // Collect constant bits and insert into mask/undef bit masks. 
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, + unsigned BitOffset) { if (!Cst) return false; unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits(); if (isa<UndefValue>(Cst)) { - Mask = APInt::getNullValue(SizeInBits); - Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits); + Undefs.setBits(BitOffset, BitOffset + CstSizeInBits); return true; } if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { - Mask = CInt->getValue().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask.insertBits(CInt->getValue(), BitOffset); return true; } if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { - Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset); return true; } return false; }; + // Extract constant bits from build vector. + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + const SDValue &Src = Op.getOperand(i); + unsigned BitOffset = i * SrcEltSizeInBits; + if (Src.isUndef()) { + UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); + continue; + } + auto *Cst = cast<ConstantSDNode>(Src); + APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); + MaskBits.insertBits(Bits, BitOffset); + } + return SplitBitData(); + } + // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); @@ -5268,117 +5467,59 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); - for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) { - APInt Bits, Undefs; - if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs)) + for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) + if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits, + i * CstEltSizeInBits)) return false; - MaskBits |= Bits.shl(i * CstEltSizeInBits); - UndefBits |= Undefs.shl(i * CstEltSizeInBits); - } return SplitBitData(); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && - EltSizeInBits <= Op.getScalarValueSizeInBits()) { + EltSizeInBits <= SrcEltSizeInBits) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { - APInt Bits, Undefs; - if (ExtractConstantBits(Broadcast, Bits, Undefs)) { - unsigned NumBroadcastBits = Op.getScalarValueSizeInBits(); - unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits; - for (unsigned i = 0; i != NumBroadcastElts; ++i) { - MaskBits |= Bits.shl(i * NumBroadcastBits); - UndefBits |= Undefs.shl(i * NumBroadcastBits); + APInt Bits(SizeInBits, 0); + APInt Undefs(SizeInBits, 0); + if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) { + for (unsigned i = 0; i != NumSrcElts; ++i) { + MaskBits |= Bits.shl(i * SrcEltSizeInBits); + UndefBits |= Undefs.shl(i * SrcEltSizeInBits); } return SplitBitData(); } } } + // Extract a rematerialized scalar constant insertion. 
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL && + Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { + auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); + MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); + MaskBits = MaskBits.zext(SizeInBits); + return SplitBitData(); + } + return false; } -// TODO: Merge more of this with getTargetConstantBitsFromNode. static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl<uint64_t> &RawMask) { - MaskNode = peekThroughBitcasts(MaskNode); - - MVT VT = MaskNode.getSimpleValueType(); - assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); - unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits; - - // Split an APInt element into MaskEltSizeInBits sized pieces and - // insert into the shuffle mask. - auto SplitElementToMask = [&](APInt Element) { - // Note that this is x86 and so always little endian: the low byte is - // the first byte of the mask. - int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits; - for (int i = 0; i < Split; ++i) { - APInt RawElt = Element.getLoBits(MaskEltSizeInBits); - Element = Element.lshr(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - }; - - if (MaskNode.getOpcode() == X86ISD::VBROADCAST) { - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0 - if (VT.getScalarSizeInBits() != MaskEltSizeInBits) - return false; - if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) { - const APInt &MaskElement = CN->getAPIntValue(); - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - } + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + + // Extract the raw target constant bits. + // FIXME: We currently don't support UNDEF bits or mask entries. + if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, + EltBits, /* AllowWholeUndefs */ false, + /* AllowPartialUndefs */ false)) return false; - } - if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && - MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { - SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); - if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { - if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) { - RawMask.push_back(CN->getZExtValue()); - RawMask.append(NumMaskElts - 1, 0); - return true; - } - - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; - SplitElementToMask(CN->getAPIntValue()); - RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); - return true; - } - } - return false; - } - - if (MaskNode.getOpcode() != ISD::BUILD_VECTOR) - return false; - - // We can always decode if the buildvector is all zero constants, - // but can't use isBuildVectorAllZeros as it might contain UNDEFs. 
- if (all_of(MaskNode->ops(), X86::isZeroNode)) { - RawMask.append(NumMaskElts, 0); - return true; - } - - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) - return false; - - for (SDValue Op : MaskNode->ops()) { - if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode())) - SplitElementToMask(CN->getAPIntValue()); - else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode())) - SplitElementToMask(CFN->getValueAPF().bitcastToAPInt()); - else - return false; - } + // Insert the extracted elements into the mask. + for (APInt Elt : EltBits) + RawMask.push_back(Elt.getZExtValue()); return true; } @@ -5405,6 +5546,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); @@ -5473,8 +5615,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, IsUnary = true; break; case X86ISD::VBROADCAST: { - // We only decode broadcasts of same-sized vectors at the moment. - if (N->getOperand(0).getValueType() == VT) { + SDValue N0 = N->getOperand(0); + // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, + // add the pre-extracted value to the Ops vector. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getOperand(0).getValueType() == VT && + N0.getConstantOperandVal(1) == 0) + Ops.push_back(N0.getOperand(0)); + + // We only decode broadcasts of same-sized vectors, unless the broadcast + // came from an extract from the original width. If we found one, we + // pushed it the Ops vector above. + if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(VT, Mask); IsUnary = true; break; @@ -5669,6 +5821,19 @@ static bool setTargetShuffleZeroElements(SDValue N, V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); + assert((VT.getSizeInBits() % Mask.size()) == 0 && + "Illegal split of shuffle value type"); + unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + + // Extract known constant input data. + APInt UndefSrcElts[2]; + SmallVector<APInt, 32> SrcEltBits[2]; + bool IsSrcConstant[2] = { + getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], + SrcEltBits[0], true, false), + getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], + SrcEltBits[1], true, false)}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; @@ -5677,6 +5842,7 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; // Determine shuffle input and normalize the mask. + unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; @@ -5686,39 +5852,27 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; } - // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. - if (V.getOpcode() != ISD::BUILD_VECTOR) - continue; - - // If the BUILD_VECTOR has fewer elements then the (larger) source - // element must be UNDEF/ZERO. - // TODO: Is it worth testing the individual bits of a constant? - if ((Size % V.getNumOperands()) == 0) { - int Scale = Size / V->getNumOperands(); - SDValue Op = V.getOperand(M / Scale); - if (Op.isUndef()) + // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. 
+ // TODO: We currently only set UNDEF for integer types - floats use the same + // registers as vectors and many of the scalar folded loads rely on the + // SCALAR_TO_VECTOR pattern. + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + (Size % V.getValueType().getVectorNumElements()) == 0) { + int Scale = Size / V.getValueType().getVectorNumElements(); + int Idx = M / Scale; + if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; - else if (X86::isZeroNode(Op)) + else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } - // If the BUILD_VECTOR has more elements then all the (smaller) source - // elements must be all UNDEF or all ZERO. - if ((V.getNumOperands() % Size) == 0) { - int Scale = V->getNumOperands() / Size; - bool AllUndef = true; - bool AllZero = true; - for (int j = 0; j < Scale; ++j) { - SDValue Op = V.getOperand((M * Scale) + j); - AllUndef &= Op.isUndef(); - AllZero &= X86::isZeroNode(Op); - } - if (AllUndef) + // Attempt to extract from the source's constant bits. + if (IsSrcConstant[SrcIdx]) { + if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; - else if (AllZero) + else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; - continue; } } @@ -5744,11 +5898,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, unsigned Opcode = N.getOpcode(); switch (Opcode) { - case ISD::AND: { + case ISD::AND: + case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. - SmallBitVector UndefElts; + APInt UndefElts; SmallVector<APInt, 32> EltBits; - if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits)) + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + bool IsAndN = (X86ISD::ANDNP == Opcode); + uint64_t ZeroMask = IsAndN ? 255 : 0; + if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { @@ -5758,9 +5917,55 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; - Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i); + Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } - Ops.push_back(N.getOperand(0)); + Ops.push_back(IsAndN ? N1 : N0); + return true; + } + case ISD::SCALAR_TO_VECTOR: { + // Match against a scalar_to_vector of an extract from a similar vector. + SDValue N0 = N.getOperand(0); + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N0.getOperand(0).getValueType() != VT || + !isa<ConstantSDNode>(N0.getOperand(1)) || + NumElts <= N0.getConstantOperandVal(1) || + !N->isOnlyUserOf(N0.getNode())) + return false; + Ops.push_back(N0.getOperand(0)); + Mask.push_back(N0.getConstantOperandVal(1)); + Mask.append(NumElts - 1, SM_SentinelUndef); + return true; + } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue InVec = N.getOperand(0); + SDValue InScl = N.getOperand(1); + uint64_t InIdx = N.getConstantOperandVal(2); + assert(InIdx < NumElts && "Illegal insertion index"); + + // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. + if (X86::isZeroNode(InScl)) { + Ops.push_back(InVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); + return true; + } + + // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern. + // TODO: Expand this to support INSERT_VECTOR_ELT/etc. + unsigned ExOp = + (X86ISD::PINSRB == Opcode ? 
X86ISD::PEXTRB : X86ISD::PEXTRW); + if (InScl.getOpcode() != ISD::AssertZext || + InScl.getOperand(0).getOpcode() != ExOp) + return false; + + SDValue ExVec = InScl.getOperand(0).getOperand(0); + uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1); + assert(ExIdx < NumElts && "Illegal extraction index"); + Ops.push_back(InVec); + Ops.push_back(ExVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::VSHLI: @@ -5795,6 +6000,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, } return true; } + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VZEXT: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); @@ -5810,36 +6016,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return false; } +/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. +static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask) { + int MaskWidth = Mask.size(); + SmallVector<SDValue, 16> UsedInputs; + for (int i = 0, e = Inputs.size(); i < e; ++i) { + int lo = UsedInputs.size() * MaskWidth; + int hi = lo + MaskWidth; + if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + UsedInputs.push_back(Inputs[i]); + continue; + } + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; + } + Inputs = UsedInputs; +} + /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the -/// Op0/Op1 inputs accordingly. +/// inputs accordingly. /// Returns true if the target shuffle mask was decoded. -static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, +static bool resolveTargetShuffleInputs(SDValue Op, + SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask) { - SmallVector<SDValue, 2> Ops; - if (!setTargetShuffleZeroElements(Op, Mask, Ops)) - if (!getFauxShuffleMask(Op, Mask, Ops)) + if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) + if (!getFauxShuffleMask(Op, Mask, Inputs)) return false; - int NumElts = Mask.size(); - bool Op0InUse = any_of(Mask, [NumElts](int Idx) { - return 0 <= Idx && Idx < NumElts; - }); - bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; }); - - Op0 = Op0InUse ? Ops[0] : SDValue(); - Op1 = Op1InUse ? Ops[1] : SDValue(); - - // We're only using Op1 - commute the mask and inputs. - if (!Op0InUse && Op1InUse) { - for (int &M : Mask) - if (NumElts <= M) - M -= NumElts; - Op0 = Op1; - Op1 = SDValue(); - } - + resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } @@ -5914,10 +6122,9 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, - unsigned NumNonZero, unsigned NumZero, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (NumNonZero > 8) return SDValue(); @@ -5928,18 +6135,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, // SSE4.1 - use PINSRB to insert each byte directly. 
if (Subtarget.hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. if (First) { - if (NumZero) - V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v16i8); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v16i8, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v16i8, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -5958,24 +6173,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { + // FIXME: Investigate extending to i32 instead of just i16. + // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; if (LastIsNonZero) { - LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, - MVT::i16, Op.getOperand(i-1)); + LastElt = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, - ThisElt, DAG.getConstant(8, dl, MVT::i8)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, + DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; - if (ThisElt.getNode()) - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i/2, dl)); + if (ThisElt) { + if (1 == i) { + V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) + : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + } else { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i / 2, dl)); + } + } } } @@ -5986,8 +6212,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + const X86Subtarget &Subtarget) { if (NumNonZero > 4) return SDValue(); @@ -5995,18 +6220,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. 
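// --- Editorial sketch, not part of the diff above ------------------------
// Scalar model of the pre-SSE4.1 v16i8 path in the hunk above: consecutive
// byte pairs are packed into 16-bit lanes (even byte in the low half, odd
// byte zero-extended and shifted into the high half, zero bytes contribute
// nothing) before being inserted into a v8i16 at index i/2. Little-endian
// x86 lane layout assumed; the function name is illustrative only.
#include <cstdint>

static uint16_t packBytePair(uint8_t Even, bool EvenNonZero,
                             uint8_t Odd, bool OddNonZero) {
  uint16_t Lane = 0;
  if (EvenNonZero)
    Lane |= Even;                 // ZERO_EXTEND to i16
  if (OddNonZero)
    Lane |= uint16_t(Odd) << 8;   // ZERO_EXTEND, SHL 8, OR
  return Lane;
}
// packBytePair(0x34, true, 0x12, true) == 0x1234, the 16-bit lane value the
// INSERT_VECTOR_ELT at index i/2 receives above.
// --------------------------------------------------------------------------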
if (First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v8i16); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v8i16, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -6015,8 +6248,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + const X86Subtarget &Subtarget) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { @@ -6212,7 +6444,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, /// /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, - SDLoc &DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { unsigned NumElems = Elts.size(); @@ -6376,14 +6608,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, return SDValue(); } -static Constant *getConstantVector(MVT VT, APInt SplatValue, +static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector<Constant *, 32> ConstantVec; for (unsigned i = 0; i < NumElm; i++) { - APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize); + APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { assert((ScalarSize == 32 || ScalarSize == 64) && @@ -6664,6 +6896,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. if (!isa<ConstantSDNode>(ExtIdx)) return SDValue(); @@ -6694,11 +6927,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); - for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { - unsigned Idx = InsertIndices[i]; + + for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); - } return NV; } @@ -7347,7 +7579,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); } return SDValue(); @@ -7418,7 +7650,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // a constant pool load than it is to do a movd + shuffle. 
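// --- Editorial sketch, not part of the diff above ------------------------
// Scalar equivalent of the getConstantVector change in the hunk above:
// SplatValue.extractBits(ScalarSize, ScalarSize * i) yields the same field
// the previous lshr(ScalarSize * i).trunc(ScalarSize) pair produced. For
// scalar sizes up to 64 bits this is plain shift-and-mask (the helper name
// is illustrative; the caller keeps ScalarSize * Index below 64 here).
#include <cstdint>

static uint64_t extractElementBits(uint64_t SplatValue, unsigned ScalarSize,
                                   unsigned Index) {
  const uint64_t FieldMask =
      ScalarSize >= 64 ? ~0ull : ((1ull << ScalarSize) - 1);
  return (SplatValue >> (ScalarSize * Index)) & FieldMask;
}
// extractElementBits(0x1122334455667788ull, 32, 1) == 0x11223344
// --------------------------------------------------------------------------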
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && (!IsAllConstants || Idx == 0)) { - if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; @@ -7561,17 +7793,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) - if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. @@ -7767,7 +7999,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); if (V1.isUndef()) - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); if (IsZeroV1) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); @@ -7956,7 +8188,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } -} + } return true; } @@ -7986,6 +8218,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, return true; } +// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle +// mask. +static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, + const APInt &Zeroable) { + int NumElts = Mask.size(); + assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + + SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); + TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + } + return TargetMask; +} + +// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd +// instructions. +static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return false; + + SmallVector<int, 8> Unpcklwd; + createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, + /* Unary = */ false); + SmallVector<int, 8> Unpckhwd; + createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, + /* Unary = */ false); + bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || + isTargetShuffleEquivalent(Mask, Unpckhwd)); + return IsUnpackwdMask; +} + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 
/// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -8009,7 +8276,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { return Imm; } -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } @@ -8022,9 +8289,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); +static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); @@ -8039,7 +8306,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; + Zeroable.setBit(i); continue; } @@ -8057,17 +8324,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) - Zeroable[i] = true; + Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } continue; } @@ -8081,7 +8350,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } - Zeroable[i] = AllZeroable; + if (AllZeroable) + Zeroable.setBit(i); continue; } } @@ -8096,19 +8366,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. -static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, - ArrayRef<int> Mask,const EVT &VectorType, +static bool isNonZeroElementsInOrder(const APInt &Zeroable, + ArrayRef<int> Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. - for (int i = 0, e = Zeroable.size(); i < e; i++) { + for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. - if (Mask[i] == -1) + assert(Mask[i] >= -1 && "Out of bound mask element!"); + if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element - if (NextElement == -1) { + if (NextElement < 0) { NextElement = Mask[i] != 0 ? 
VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } @@ -8124,7 +8395,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8179,19 +8450,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); -// Function convertBitVectorToUnsigned - The function gets SmallBitVector -// as argument and convert him to unsigned. -// The output of the function is not(zeroable) -static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { - unsigned convertBit = 0; - for (int i = 0, e = Zeroable.size(); i < e; i++) - convertBit |= !(Zeroable[i]) << i; - return convertBit; -} - // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8199,7 +8460,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); - unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); @@ -8215,6 +8476,91 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, ZeroVector); } +static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, + unsigned &UnpackOpcode, bool IsUnary, + ArrayRef<int> TargetMask, SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + int NumElts = VT.getVectorNumElements(); + + bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; + for (int i = 0; i != NumElts; i += 2) { + int M1 = TargetMask[i + 0]; + int M2 = TargetMask[i + 1]; + Undef1 &= (SM_SentinelUndef == M1); + Undef2 &= (SM_SentinelUndef == M2); + Zero1 &= isUndefOrZero(M1); + Zero2 &= isUndefOrZero(M2); + } + assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && + "Zeroable shuffle detected"); + + // Attempt to match the target mask against the unpack lo/hi mask patterns. + SmallVector<int, 64> Unpckl, Unpckh; + createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + // If an unary shuffle, attempt to match as an unpack lo/hi with zero. + if (IsUnary && (Zero1 || Zero2)) { + // Don't bother if we can blend instead. 
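// --- Editorial sketch, not part of the diff above ------------------------
// The lowerVectorShuffleToEXPAND change above drops the hand-rolled
// convertBitVectorToUnsiged helper in favour of (~Zeroable).getZExtValue():
// APInt's complement is already confined to the mask width, so the VEXPAND
// write mask is simply the complement of the zeroable lanes. Scalar
// equivalent for up to 64 lanes (name illustrative only):
#include <cstdint>

static uint64_t expandWriteMask(uint64_t ZeroableLanes, unsigned NumLanes) {
  const uint64_t LaneMask = NumLanes >= 64 ? ~0ull : ((1ull << NumLanes) - 1);
  return ~ZeroableLanes & LaneMask;   // 1 = lane written, 0 = lane zeroed
}
// e.g. 8 lanes with lanes {1,3,6} zeroable: expandWriteMask(0x4A, 8) == 0xB5
// --------------------------------------------------------------------------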
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && + isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) + return false; + + bool MatchLo = true, MatchHi = true; + for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { + int M = TargetMask[i]; + + // Ignore if the input is known to be zero or the index is undef. + if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || + (M == SM_SentinelUndef)) + continue; + + MatchLo &= (M == Unpckl[i]); + MatchHi &= (M == Unpckh[i]); + } + + if (MatchLo || MatchHi) { + UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; + V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; + return true; + } + } + + // If a binary shuffle, commute and try again. + if (!IsUnary) { + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + std::swap(V1, V2); + return true; + } + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + std::swap(V1, V2); + return true; + } + } + + return false; +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, @@ -8248,13 +8594,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -8286,10 +8631,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); - int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) @@ -8307,51 +8650,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::OR, DL, VT, V1, V2); } -/// \brief Try to emit a blend instruction for a shuffle. -/// -/// This doesn't do any checks for the availability of instructions for blending -/// these values. It relies on the availability of the X86ISD::BLENDI pattern to -/// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is a blend, or convertible into a blend with zero. 
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Original, - const SmallBitVector &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - SmallVector<int, 8> Mask(Original.begin(), Original.end()); - bool ForceV1Zero = false, ForceV2Zero = false; +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget &Subtarget, + SelectionDAG &DAG); + +static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, + MutableArrayRef<int> TargetMask, + bool &ForceV1Zero, bool &ForceV2Zero, + uint64_t &BlendMask) { + bool V1IsZeroOrUndef = + V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZeroOrUndef = + V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); + + BlendMask = 0; + ForceV1Zero = false, ForceV2Zero = false; + assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. - unsigned BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - if (M < 0) + for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { + int M = TargetMask[i]; + if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { - BlendMask |= 1u << i; + BlendMask |= 1ull << i; continue; } - if (Zeroable[i]) { - if (V1IsZero) { + if (M == SM_SentinelZero) { + if (V1IsZeroOrUndef) { ForceV1Zero = true; - Mask[i] = i; + TargetMask[i] = i; continue; } - if (V2IsZero) { + if (V2IsZeroOrUndef) { ForceV2Zero = true; - BlendMask |= 1u << i; - Mask[i] = i + Size; + BlendMask |= 1ull << i; + TargetMask[i] = i + Size; continue; } } - return SDValue(); // Shuffled input! + return false; } + return true; +} + +uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { + uint64_t ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1ull << i)) + ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); + return ScaledMask; +} + +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is a blend, or convertible into a blend with zero. +static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); + + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + BlendMask)) + return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. 
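// --- Editorial sketch, not part of the diff above ------------------------
// Standalone, compile-time-checked copy of the scaleVectorShuffleBlendMask
// helper introduced above: every set blend bit is widened to Scale
// consecutive set bits, e.g. when a 64-bit-element blend is re-expressed in
// 32-bit lanes as in the AVX2 path further down. Requires C++14 for the
// constexpr loop; the worked example is editorial, not LLVM code.
#include <cstdint>

static constexpr uint64_t scaleBlendMask(uint64_t BlendMask, int Size,
                                         int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
// 4-element mask 0x5 (take elements 0 and 2 from V2), scaled x2: each set
// bit becomes a pair of set bits.
static_assert(scaleBlendMask(0x5, 4, 2) == 0x33,
              "each blend bit widens to Scale consecutive bits");
// --------------------------------------------------------------------------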
if (ForceV1Zero) @@ -8359,15 +8732,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); - auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { - unsigned ScaledMask = 0; - for (int i = 0; i != Size; ++i) - if (BlendMask & (1u << i)) - for (int j = 0; j != Scale; ++j) - ScaledMask |= 1u << (i * Scale + j); - return ScaledMask; - }; - switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -8387,7 +8751,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -8400,7 +8764,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -8417,7 +8781,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) - BlendMask |= 1u << i; + BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } @@ -8428,6 +8792,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + if (Subtarget.hasBWI() && Subtarget.hasVLX()) { + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); + return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); + } + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) @@ -8465,7 +8836,17 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } - + case MVT::v16f32: + case MVT::v8f64: + case MVT::v8i64: + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: { + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); + return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); + } default: llvm_unreachable("Not a supported integer vector type!"); } @@ -8503,7 +8884,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } -/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// \brief Generic routine to decompose a shuffle and blend into independent /// blends and permutes. 
/// /// This matches the extremely common pattern for handling combined @@ -8757,7 +9138,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef<int> Mask, int MaskOffset, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -8819,7 +9200,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8855,12 +9236,12 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) @@ -8987,7 +9368,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and -/// begin and can start from an offseted element index in the input; to +/// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. @@ -9027,21 +9408,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { - // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - - // For 256-bit vectors, we only need the lower (128-bit) input half. - // For 512-bit vectors, we only need the lower input half or quarter. - if (VT.getSizeInBits() > 128) - InputV = extractSubVector(InputV, 0, DAG, DL, - std::max(128, (int)VT.getSizeInBits() / Scale)); - - InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); + InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -9158,7 +9532,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// are both incredibly common and often quite performance sensitive. 
static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; @@ -9314,7 +9688,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); @@ -9612,7 +9986,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (((BroadcastIdx * EltSize) % 128) != 0) return SDValue(); - MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); + // The shuffle input might have been a bitcast we looked through; look at + // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll + // later bitcast it to BroadcastVT. + MVT SrcVT = V.getSimpleValueType(); + assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + "Unexpected vector element size"); + assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) && + "Unexpected vector size"); + + MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize); V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, DAG.getIntPtrConstant(BroadcastIdx, DL)); } @@ -9642,6 +10025,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } + // We only support broadcasting from 128-bit vectors to minimize the + // number of patterns we need to deal with in isel. So extract down to + // 128-bits. + if (SrcVT.getSizeInBits() > 128) + V = extract128BitVector(V, 0, DAG, DL); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -9653,7 +10042,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); @@ -9742,7 +10131,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); @@ -9877,7 +10266,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9959,7 +10348,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// it falls back to the floating point shuffle operation with appropriate bit /// casting. 
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10178,7 +10567,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10261,7 +10650,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10353,7 +10742,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build - // up the inputs, bypassing domain shift penalties that we would encur if we + // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); @@ -10384,18 +10773,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); - assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); + assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); SmallVector<int, 4> LoInputs; - std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0; }); + copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 0; }); + copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = @@ -10574,7 +10961,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); - else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from @@ -10830,7 +11217,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// blend if only one input is used. 
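// --- Editorial sketch, not part of the diff above ------------------------
// The v8i16 single-input lowering above gathers the distinct, in-use source
// indices of each half mask with copy_if + sort + unique (now spelled with
// LLVM's range-based copy_if). The same idiom in plain STL terms, with an
// illustrative function name:
#include <algorithm>
#include <iterator>
#include <vector>

static std::vector<int> collectUsedInputs(const std::vector<int> &HalfMask) {
  std::vector<int> Inputs;
  std::copy_if(HalfMask.begin(), HalfMask.end(), std::back_inserter(Inputs),
               [](int M) { return M >= 0; });   // drop undef (negative) lanes
  std::sort(Inputs.begin(), Inputs.end());
  Inputs.erase(std::unique(Inputs.begin(), Inputs.end()), Inputs.end());
  return Inputs;
}
// collectUsedInputs({3, -1, 3, 0}) == {0, 3}
// --------------------------------------------------------------------------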
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse, + const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; @@ -10891,7 +11278,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11075,7 +11462,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11132,14 +11519,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector<int, 4> LoInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0 && M < 8; }); + copy_if(Mask, std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 8; }); + copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); @@ -11193,7 +11579,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && - "Conflicting entrties in the original shuffle!"); + "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, @@ -11365,7 +11751,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { @@ -11621,7 +12007,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, /// \brief Handle lowering 2-lane 128-bit shuffles. 
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<int, 4> WidenedMask; @@ -12091,7 +12477,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); - assert(VT.getScalarType() == MVT::f64 && + assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); @@ -12127,6 +12513,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& + "Unexpected data type for VSHUFPD"); + unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); @@ -12153,7 +12542,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12250,7 +12639,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12338,7 +12727,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12414,6 +12803,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code using vpunpcklwd and + // vpunpckhwd instrs than vblend. + if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) + if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG)) + return V; + // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) @@ -12429,7 +12826,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. 
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12445,6 +12842,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code than vblend by using + // vpunpcklwd and vpunpckhwd instrs. + if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && + !Subtarget.hasAVX512()) + if (SDValue V = + lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; @@ -12533,7 +12939,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12619,7 +13025,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12692,7 +13098,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we @@ -12844,7 +13250,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12891,12 +13297,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, +static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12925,6 +13335,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } @@ -12938,7 +13352,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12994,12 +13408,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13062,12 +13480,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13109,12 +13530,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13159,6 +13584,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -13170,7 +13599,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. 
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && @@ -13251,7 +13680,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) - V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V1 = getOnesVector(ExtVT, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); @@ -13260,7 +13689,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) - V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V2 = getOnesVector(ExtVT, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); @@ -13392,8 +13821,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - if (Zeroable.all()) + APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but @@ -13569,10 +13998,14 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, - // extend vector to VR512 + // extend vector to VR512/128 if (!isa<ConstantSDNode>(Idx)) { - MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); - SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + unsigned NumElts = VecVT.getVectorNumElements(); + // Extending v8i1/v16i1 to 512-bit get better performance on KNL + // than extending to 128/256bit. + unsigned VecSize = (NumElts <= 4 ? 
128 : 512); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); @@ -13590,9 +14023,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned MaxSift = VecVT.getVectorNumElements() - 1; if (MaxSift - IdxVal) - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, DAG.getIntPtrConstant(0, dl)); @@ -13610,24 +14043,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return ExtractBitFromMaskVector(Op, DAG); if (!isa<ConstantSDNode>(Idx)) { - if (VecVT.is512BitVector() || - (VecVT.is256BitVector() && Subtarget.hasInt256() && - VecVT.getScalarSizeInBits() == 32)) { - - MVT MaskEltVT = - MVT::getIntegerVT(VecVT.getScalarSizeInBits()); - MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / - MaskEltVT.getSizeInBits()); + // Its more profitable to go through memory (1 cycles throughput) + // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) + // IACA tool was used to get performance estimation + // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) + // + // example : extractelement <16 x i8> %a, i32 %i + // + // Block Throughput: 3.00 Cycles + // Throughput Bottleneck: Port5 + // + // | Num Of | Ports pressure in cycles | | + // | Uops | 0 - DV | 5 | 6 | 7 | | + // --------------------------------------------- + // | 1 | | 1.0 | | | CP | vmovd xmm1, edi + // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 + // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 + // Total Num Of Uops: 4 + // + // + // Block Throughput: 1.00 Cycles + // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 + // + // | | Ports pressure in cycles | | + // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | + // --------------------------------------------------------- + // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 + // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] + // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] + // Total Num Of Uops: 4 - Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, - getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, - DAG.getConstant(0, dl, PtrVT)); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, - DAG.getConstant(0, dl, PtrVT)); - } return SDValue(); } @@ -13675,7 +14120,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - // TODO: handle v16i8. + // TODO: We only extract a single element from v16i8, we can probably afford + // to be more aggressive here before using the default approach of spilling to + // stack. + if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { + // Extract either the lowest i32 or any i16, and extract the sub-byte. 
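// --- Editorial sketch, not part of the diff above ------------------------
// Scalar model of the v16i8 extract path implemented just below: read the
// lowest 32-bit dword (index 0 only) or the containing 16-bit word, shift
// the requested sub-byte down, then truncate -- avoiding the default
// spill-to-stack approach mentioned in the TODO above. Little-endian x86
// layout assumed; the function name is illustrative only.
#include <cstdint>
#include <cstring>

static uint8_t extractByte(const uint8_t Bytes[16], unsigned IdxVal) {
  if (IdxVal / 4 == 0) {
    // EXTRACT_VECTOR_ELT(v4i32, 0), then SRL by (IdxVal % 4) * 8, TRUNCATE.
    uint32_t DWord;
    std::memcpy(&DWord, Bytes, sizeof(DWord));
    return uint8_t(DWord >> ((IdxVal % 4) * 8));
  }
  // EXTRACT_VECTOR_ELT(v8i16, IdxVal / 2), then SRL by (IdxVal % 2) * 8.
  uint16_t Word;
  std::memcpy(&Word, Bytes + (IdxVal / 2) * 2, sizeof(Word));
  return uint8_t(Word >> ((IdxVal % 2) * 8));
}
// --------------------------------------------------------------------------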
+ int DWordIdx = IdxVal / 4; + if (DWordIdx == 0) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + DAG.getIntPtrConstant(DWordIdx, dl)); + int ShiftVal = (IdxVal % 4) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, + DAG.getConstant(ShiftVal, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + int WordIdx = IdxVal / 2; + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) @@ -13734,7 +14205,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if(Vec.isUndef()) { if (IdxVal) - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } @@ -13744,21 +14215,21 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if (IdxVal == 0 ) { // EltInVec already at correct index and other bits are 0. // Clean the first bit in source vector. - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); @@ -13790,17 +14261,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, auto *N2C = cast<ConstantSDNode>(N2); unsigned IdxVal = N2C->getZExtValue(); - // If we are clearing out a element, we do this more efficiently with a - // blend shuffle than a costly integer insertion. - // TODO: would other rematerializable values (e.g. allbits) benefit as well? + bool IsZeroElt = X86::isZeroNode(N1); + bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); + + // If we are inserting a element, see if we can do this more efficiently with + // a blend shuffle with a rematerializable vector than a costly integer + // insertion. // TODO: pre-SSE41 targets will tend to use bit masking - this could still // be beneficial if we are inserting several zeros and can combine the masks. - if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) { - SmallVector<int, 8> ClearMask; + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) - ClearMask.push_back(i == IdxVal ? 
i + NumElts : i); - SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask); + BlendMask.push_back(i == IdxVal ? i + NumElts : i); + SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) + : DAG.getConstant(-1, dl, VT); + return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert @@ -13837,25 +14312,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget.hasSSE41()) { - if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { - unsigned Opc; - if (VT == MVT::v8i16) { - Opc = X86ISD::PINSRW; - } else { - assert(VT == MVT::v16i8); - Opc = X86ISD::PINSRB; - } - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. SSE41 required for pinsrb. + if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { + unsigned Opc; + if (VT == MVT::v8i16) { + assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); + assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); + Opc = X86ISD::PINSRB; } + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into @@ -13885,36 +14362,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } - if (EltVT == MVT::i32 || EltVT == MVT::i64) { - // PINSR* works with constant index. + // PINSR* works with constant index. + if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; - } } - if (EltVT == MVT::i8) - return SDValue(); - - if (EltVT.getSizeInBits() == 16) { - // Transform it so it match pinsrw which expects a 16-bit value in a GR32 - // as its second argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); - } return SDValue(); } -static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); + // It's always cheaper to replace a xor+movd with xorps and simplifies further + // combines. + if (X86::isZeroNode(Op.getOperand(0))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. 
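// [Editor's aside - illustrative sketch only, not part of this commit.] The
// constant-element insertion above, written out as the generic two-source
// shuffle it becomes: lane i reads source A for mask values below N and source
// B (the rematerializable all-zeros/all-ones vector) for values of N or more,
// so a mask that is i everywhere except IdxVal (where it is IdxVal + N) keeps
// every lane of A except the inserted one. All names are the editor's own.
#include <array>
static std::array<int, 8> BlendInsertConstant(const std::array<int, 8> &A,
                                              int Cst, unsigned IdxVal) {
  const unsigned N = 8;
  std::array<int, 8> B;
  B.fill(Cst);                                  // 0 or -1 in the lowering above
  std::array<int, 8> Res;
  for (unsigned i = 0; i != N; ++i) {
    unsigned M = (i == IdxVal) ? i + N : i;     // the BlendMask built above
    Res[i] = (M < N) ? A[M] : B[M - N];         // two-source shuffle semantics
  }
  return Res;
}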
- unsigned SizeFactor = OpVT.getSizeInBits()/128; + unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); @@ -13923,9 +14393,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + + // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. + if (OpVT == MVT::v4i32) + return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } @@ -13947,20 +14421,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, In.getSimpleValueType().is512BitVector()) && "Can only extract from 256-bit or 512-bit vectors"); - if (ResVT.is128BitVector()) - return extract128BitVector(In, IdxVal, DAG, dl); - if (ResVT.is256BitVector()) - return extract256BitVector(In, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); -} + // If the input is a buildvector just emit a smaller one. + unsigned ElemsPerChunk = ResVT.getVectorNumElements(); + if (In.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT, + makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); -static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) - if (llvm::all_of(ValidUsers, - [&I](SDValue V) { return V.getNode() != *I; })) - return false; - return true; + // Everything else is legal. + return Op; } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a @@ -13968,83 +14436,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { // the upper bits of a vector. 
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX"); - - SDLoc dl(Op); - SDValue Vec = Op.getOperand(0); - SDValue SubVec = Op.getOperand(1); - SDValue Idx = Op.getOperand(2); - - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT OpVT = Op.getSimpleValueType(); - MVT SubVecVT = SubVec.getSimpleValueType(); - - if (OpVT.getVectorElementType() == MVT::i1) - return insert1BitVector(Op, DAG, Subtarget); - - assert((OpVT.is256BitVector() || OpVT.is512BitVector()) && - "Can only insert into 256-bit or 512-bit vectors"); + assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); - if (Idx2 && Idx2->getZExtValue() == 0) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; - } - } - // If lower/upper loads are the same and the only users of the load, then - // lower to a VBROADCASTF128/VBROADCASTI128/etc. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && - areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - } - } - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - } - } - } - - if (SubVecVT.is128BitVector()) - return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); - - if (SubVecVT.is256BitVector()) - return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); + return insert1BitVector(Op, DAG, Subtarget); } // Returns the appropriate wrapper opcode for a global reference. @@ -14062,7 +14456,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as -// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// their target counterpart wrapped in the X86ISD::Wrapper node. 
Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected @@ -14438,7 +14832,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture - // Need to generate someting similar to: + // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) @@ -15489,32 +15883,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, - DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } - // Truncate with PACKSS if we are truncating a vector comparison result. - // TODO: We should be able to support other operations as long as we - // we are saturating+packing zero/all bits only. - auto IsPackableComparison = [](SDValue V) { - unsigned Opcode = V.getOpcode(); - return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ || - Opcode == X86ISD::CMPP); - }; - - if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS && - all_of(In->ops(), IsPackableComparison))) { + // Truncate with PACKSS if we are truncating a vector zero/all-bits result. + if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) return V; - } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); - In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), - ShufMask); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } @@ -15530,30 +15913,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - // On AVX2, v8i32 -> v8i16 becomed PSHUFB. + // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
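// [Editor's aside - standalone sketch, not part of this commit.] Why the
// shuffle masks above implement truncation: on a little-endian target the low
// half of each wide element sits in the even narrow lane, so keeping lanes
// {0, 2, 4, 6, ...} of the reinterpreted vector is exactly a truncate. Shown
// here for v4i64 -> v4i32 with editor-chosen names.
#include <array>
#include <cstdint>
#include <cstring>
static std::array<uint32_t, 4>
TruncV4i64ToV4i32(const std::array<uint64_t, 4> &In) {
  std::array<uint32_t, 8> Lanes;                   // bitcast v4i64 -> v8i32
  std::memcpy(Lanes.data(), In.data(), sizeof(Lanes));
  return {Lanes[0], Lanes[2], Lanes[4], Lanes[6]}; // the {0,2,4,6} mask above
}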
if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); - SmallVector<SDValue,32> pshufbMask; - for (unsigned i = 0; i < 2; ++i) { - pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); - for (unsigned j = 0; j < 8; ++j) - pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); - } - SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask); - In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + // The PSHUFB mask: + static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1, + 16, 17, 20, 21, 24, 25, 28, 29, + -1, -1, -1, -1, -1, -1, -1, -1 }; + In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); - static const int ShufMask[] = {0, 2, -1, -1}; - In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), - ShufMask); + static const int ShufMask2[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); @@ -15572,9 +15945,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; - SDValue Undef = DAG.getUNDEF(MVT::v16i8); - OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); @@ -15598,17 +15970,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; - SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), - DAG.getUNDEF(NVT), MaskVec); + In = DAG.getBitcast(NVT, In); + SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } -SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { @@ -15616,8 +15985,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, - dl, VT, + return DAG.getNode(IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } @@ -15891,7 +16259,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); - // If more than one full vectors are evaluated, OR them first before PTEST. + // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. @@ -15900,8 +16268,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. @@ -16366,7 +16733,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, } /// If we have at least two divisions that use the same divisor, convert to -/// multplication by a reciprocal. This may need to be adjusted for a given +/// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the @@ -17241,12 +17608,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y + // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - - unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = + cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { @@ -17283,6 +17652,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; + } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && + Cmp.getOperand(0).getOpcode() == ISD::AND && + isOneConstant(Cmp.getOperand(0).getOperand(1))) { + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue Src1, Src2; + // true if Op2 is XOR or OR operator and one of its operands + // is equal to Op1 + // ( a , a op b) || ( b , a op b) + auto isOrXorPattern = [&]() { + if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && + (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { + Src1 = + Op2.getOperand(0) == Op1 ? 
Op2.getOperand(1) : Op2.getOperand(0); + Src2 = Op1; + return true; + } + return false; + }; + + if (isOrXorPattern()) { + SDValue Neg; + unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); + // we need mask of all zeros or ones with same size of the other + // operands. + if (CmpSz > VT.getSizeInBits()) + Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); + else if (CmpSz < VT.getSizeInBits()) + Neg = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), + DAG.getConstant(1, DL, VT)); + else + Neg = CmpOp0; + SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Neg); // -(and (x, 0x1)) + SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z + return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y + } } } @@ -17423,17 +17829,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, // SKX processor if ((InVTElt == MVT::i1) && - (((Subtarget.hasBWI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || - - ((Subtarget.hasBWI() && VT.is512BitVector() && - VTElt.getSizeInBits() <= 16)) || + (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) || - ((Subtarget.hasDQI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))) - ((Subtarget.hasDQI() && VT.is512BitVector() && - VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned NumElts = VT.getVectorNumElements(); @@ -17441,8 +17840,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, if (VT.is512BitVector() && InVTElt != MVT::i1 && (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) - return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); + return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); } if (InVTElt != MVT::i1) @@ -17454,10 +17853,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue V; if (Subtarget.hasDQI()) { - V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); assert(!VT.is512BitVector() && "Unexpected vector type"); } else { - SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); + SDValue NegOne = getOnesVector(ExtVT, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); if (ExtVT == VT) @@ -17506,11 +17905,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); - // SSE41 targets can use the pmovsx* instructions directly. - unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? - X86ISD::VSEXT : X86ISD::VZEXT; - if (Subtarget.hasSSE41()) + // SSE41 targets can use the pmovsx* instructions directly for 128-bit results, + // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still + // need to be handled here for 256/512-bit results. + if (Subtarget.hasInt256()) { + assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); + unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? + X86ISD::VSEXT : X86ISD::VZEXT; return DAG.getNode(ExtOpc, dl, VT, In); + } // We should only get here for sign extend. 
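// [Editor's aside - standalone sketch, not part of this commit.] The CMOV-free
// select rewrite introduced earlier in this hunk: when the condition is
// (x & 1) == 0 and the false arm is z ^ y (or z | y), the mask -(x & 1) is
// either 0 or all-ones, so (-(x & 1) & z) ^ y selects between y and z ^ y
// without a branch. The function name is the editor's own.
#include <cstdint>
static uint32_t SelectXorViaMask(uint32_t x, uint32_t y, uint32_t z) {
  uint32_t Mask = 0u - (x & 1u);   // 0 if the bit is clear, 0xFFFFFFFF if set
  return (Mask & z) ^ y;           // == ((x & 1) == 0) ? y : (z ^ y)
}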
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && @@ -17595,8 +17998,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); + OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } @@ -17674,7 +18077,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, MVT VT = Op.getValueType().getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); - if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + if ((Subtarget.hasBWI() && NumElts >= 32) || + (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { @@ -17703,7 +18107,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, if (NumElts <= 8) { // A subset, assume that we have only AVX-512F - unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; + unsigned NumBitsToLoad = 8; MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), Ld->getBasePtr(), @@ -17911,7 +18315,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget.hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } @@ -18469,6 +18873,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Bitcast the source vector to the output type, this is mainly necessary for + // vXi8/vXi64 shifts. + if (VT != SrcOp.getSimpleValueType()) + SrcOp = DAG.getBitcast(VT, SrcOp); + // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; @@ -18485,9 +18894,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a - // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. - if (VT == SrcOp.getSimpleValueType() && - ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + // vector of Constants or UNDEFs. 
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector<SDValue, 8> Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; @@ -18578,11 +18986,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -18853,6 +19261,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, VT, Src1, Src2, Rnd), + Mask, passThru, Subtarget, DAG); + } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } @@ -19306,6 +19722,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Src2, Src1); return DAG.getBitcast(VT, Res); } + case MASK_BINOP: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); + return DAG.getBitcast(VT, Res); + } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: @@ -19478,6 +19903,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_avx512_knot_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kandn_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + // Invert LHS for the not. + LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, + DAG.getConstant(1, dl, MVT::v16i1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kxnor_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + // Invert result for the not. 
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, + DAG.getConstant(1, dl, MVT::v16i1)); + return DAG.getBitcast(MVT::i16, Res); + } + case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -19603,6 +20055,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } } +static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + EVT MaskVT = Mask.getValueType(); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? + if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, dl); +} + static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, @@ -19617,7 +20091,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - if (Src.isUndef()) + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? 
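// [Editor's aside - behavioural sketch, not part of this commit.] What the
// gather node computes, lane by lane: only lanes whose mask bit is set are
// loaded, the rest keep the pass-through source. With an all-ones mask the
// source operand is never read, which is why substituting a zero vector (as
// above) is safe and only serves to break the register dependency.
// Element-sized indexing is used for brevity; the real instruction also
// applies a byte scale. Names are the editor's own.
#include <array>
#include <cstdint>
static std::array<int32_t, 8> GatherDWords(const int32_t *Base,
                                           const std::array<int32_t, 8> &Index,
                                           const std::array<int32_t, 8> &Src,
                                           uint8_t Mask) {
  std::array<int32_t, 8> Res;
  for (unsigned i = 0; i != 8; ++i)
    Res[i] = (Mask & (1u << i)) ? Base[Index[i]] : Src[i];
  return Res;
}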
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -19656,7 +20133,6 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); @@ -19928,6 +20404,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } + case GATHER_AVX2: { + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain, Subtarget); + } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); @@ -19953,8 +20439,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); - assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); + assert((HintVal == 2 || HintVal == 3) && + "Wrong prefetch hint in intrinsic: should be 2 or 3"); + unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -20368,7 +20855,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Check that ECX wasn't needed by an 'inreg' parameter. 
FunctionType *FTy = Func->getFunctionType(); - const AttributeSet &Attrs = Func->getAttributes(); + const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; @@ -20802,9 +21289,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), +static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getScalarType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && @@ -20812,14 +21300,23 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } -static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Op.getOperand(1)); +static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return Lower256IntArith(Op, DAG); + MVT VT = Op.getSimpleValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + SDValue Lo = extract128BitVector(Src, 0, DAG, dl); + SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(ISD::ABS, dl, NewVT, Lo), + DAG.getNode(ISD::ABS, dl, NewVT, Hi)); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { @@ -20834,7 +21331,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - if (VT == MVT::i1) + if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. 
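// [Editor's aside - a small self-check sketch, not part of this commit.] The
// i1 special cases above are plain arithmetic modulo 2: addition and
// subtraction coincide and equal XOR, and multiplication equals AND, which is
// why ADD/SUB and MUL of i1 (and vXi1) can be lowered to XOR and AND.
#include <cassert>
static void CheckI1Arithmetic() {
  for (unsigned a = 0; a != 2; ++a)
    for (unsigned b = 0; b != 2; ++b) {
      assert(((a + b) & 1u) == (a ^ b));   // ADD -> XOR
      assert(((a - b) & 1u) == (a ^ b));   // SUB -> XOR
      assert(((a * b) & 1u) == (a & b));   // MUL -> AND
    }
}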
@@ -20874,8 +21371,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); - BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); + BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -20894,8 +21391,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); - BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); + BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21056,8 +21553,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } - SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); - SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); + SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); + SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, DAG.getConstant(8, dl, MVT::v16i16)); @@ -21073,8 +21570,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and zero/sign extend to i16. SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(ExSSE41, dl, ExVT, A); - BLo = DAG.getNode(ExSSE41, dl, ExVT, B); + ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); + BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -21093,8 +21590,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); - BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); + BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21148,8 +21645,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); } @@ -21157,11 +21654,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), - static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, std::move(Args)) - .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee( + getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, + std::move(Args)) + .setInRegister() + 
.setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); @@ -21269,15 +21770,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() < 16) return false; - if (VT.is512BitVector() && + if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; - bool LShift = VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256()); + bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()); - bool AShift = LShift && (Subtarget.hasVLX() || - (VT != MVT::v2i64 && VT != MVT::v4i64)); + bool AShift = LShift && (Subtarget.hasAVX512() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } @@ -21301,7 +21802,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (VT.is512BitVector() || Subtarget.hasVLX()) + if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -22062,10 +22563,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::INC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; @@ -22077,10 +22578,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::DEC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; @@ -22470,7 +22971,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and - // masked out higher ones) for each byte. PSHUB is used separately with both + // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. // @@ -22867,8 +23368,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; @@ -22885,8 +23386,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -23086,7 +23588,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. 
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23142,7 +23644,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23202,7 +23704,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); - // The pass-thru value + // The pass-through value MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src0 = ExtendToType(Src0, NewVT, DAG); @@ -23284,7 +23786,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); - case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); @@ -23303,7 +23805,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: @@ -23360,12 +23862,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); - case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::ADD: + case ISD::SUB: return LowerADD_SUB(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::ABS: return LowerABS(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); @@ -23768,7 +24271,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; @@ -23779,16 +24281,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return 
"X86ISD::FHSUB"; - case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; + case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; + case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::FRCPS: return "X86ISD::FRCPS"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; @@ -23827,7 +24332,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; @@ -23876,6 +24380,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; + case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -23976,9 +24482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; @@ -24302,7 +24812,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24338,7 +24848,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24398,7 +24908,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, unsigned MemReg = Subtarget.is64Bit() ? 
X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) @@ -24413,6 +24923,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + // Address into RAX/EAX + unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.add(MI->getOperand(i)); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + + + MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -24536,12 +25066,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -24561,12 +25091,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 16) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 16) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -24588,13 +25118,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .addReg(NextOffsetReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -24608,12 +25138,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the overflow_area address into a register. 
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -24644,13 +25174,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .addReg(NextAddrReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -24867,7 +25397,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, // // (CMOV (CMOV F, T, cc1), T, cc2) // - // to two successives branches. For that, we look for another CMOV as the + // to two successive branches. For that, we look for another CMOV as the // following instruction. // // Without this, we would add a PHI between the two jumps, which ends up @@ -25123,12 +25653,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); - MIB.addOperand(Operand); + MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. 
return BB; @@ -25508,7 +26038,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else - MIB.addOperand(MI.getOperand(MemOpndSlot + i)); + MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); @@ -25591,7 +26121,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); @@ -25599,7 +26129,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), LabelOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP @@ -25608,7 +26138,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump @@ -25625,7 +26155,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -25644,8 +26174,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; - /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */ - if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) @@ -25655,7 +26183,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) - .addReg(0) /* XII->getGlobalBaseReg(MF) */ + .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) @@ -25677,7 +26205,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're @@ -25749,9 +26277,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); unsigned MJTI = JTI->createJumpTableIndex(LPadList); - const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); - const X86RegisterInfo &RI = XII->getRegisterInfo(); - + const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { @@ -25799,8 +26325,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // N.B. the order the invoke BBs are processed in doesn't matter here. 
SmallVector<MachineBasicBlock *, 64> MBBLPads; - const MCPhysReg *SavedRegs = - Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF); + const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. @@ -26033,6 +26558,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); + + // Cache line zero + case X86::CLZERO: + return emitClzero(&MI, BB, Subtarget); + // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); @@ -26137,10 +26667,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || @@ -26167,44 +26699,91 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; LLVM_FALLTHROUGH; case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + KnownZero.setBits(1, BitWidth); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + KnownZero.setBits(NumLoBits, BitWidth); + break; + } + case X86ISD::VSHLI: + case X86ISD::VSRLI: { + if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { + KnownZero = APInt::getAllOnesValue(BitWidth); + break; + } + + DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1); + unsigned ShAmt = ShiftImm->getZExtValue(); + if (Opc == X86ISD::VSHLI) { + KnownZero = KnownZero << ShAmt; + KnownOne = KnownOne << ShAmt; + // Low bits are known zero. + KnownZero.setLowBits(ShAmt); + } else { + KnownZero = KnownZero.lshr(ShAmt); + KnownOne = KnownOne.lshr(ShAmt); + // High bits are known zero. 
+ KnownZero.setHighBits(ShAmt); + } + } break; } case X86ISD::VZEXT: { SDValue N0 = Op.getOperand(0); - unsigned NumElts = Op.getValueType().getVectorNumElements(); - unsigned InNumElts = N0.getValueType().getVectorNumElements(); - unsigned InBitWidth = N0.getValueType().getScalarSizeInBits(); + unsigned NumElts = VT.getVectorNumElements(); + + EVT SrcVT = N0.getValueType(); + unsigned InNumElts = SrcVT.getVectorNumElements(); + unsigned InBitWidth = SrcVT.getScalarSizeInBits(); + assert(InNumElts >= NumElts && "Illegal VZEXT input"); KnownZero = KnownOne = APInt(InBitWidth, 0); - APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts); - DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); + APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); + DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1); KnownOne = KnownOne.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth); + KnownZero.setBits(InBitWidth, BitWidth); break; } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { - // SETCC_CARRY sets the dest to ~0 for true or 0 for false. - if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getScalarValueSizeInBits(); + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { + unsigned VTBits = Op.getScalarValueSizeInBits(); + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case X86ISD::SETCC_CARRY: + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + return VTBits; - if (Op.getOpcode() == X86ISD::VSEXT) { - EVT VT = Op.getValueType(); - EVT SrcVT = Op.getOperand(0).getValueType(); - unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits(); + case X86ISD::VSEXT: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + Tmp += VTBits - Src.getScalarValueSizeInBits(); return Tmp; } + case X86ISD::VSRAI: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + ShiftVal += Tmp; + return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); + } + + case X86ISD::PCMPGT: + case X86ISD::PCMPEQ: + case X86ISD::CMPP: + case X86ISD::VPCOM: + case X86ISD::VPCOMU: + // Vector compares return zero/all-bits result values. + return VTBits; + } + // Fallback case. return 1; } @@ -26228,24 +26807,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). - if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && - isUndefOrEqual(Mask[0], 0) && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - - // Match against a VZEXT instruction. - // TODO: Add 256/512-bit vector support. 
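For reference on the ComputeNumSignBitsForTargetNode change above: the new VSRAI rule says an arithmetic right shift by a constant adds that many copies of the sign bit, clamped to the element width. A minimal scalar model of that rule, assuming two's-complement arithmetic shifts; the helper names below (numSignBits, signBitsAfterAShr) are illustrative, not the in-tree API.

#include <algorithm>
#include <cstdint>

// Count how many leading bits of X are copies of its sign bit (always >= 1).
static unsigned numSignBits(int32_t X) {
  unsigned N = 1;
  while (N < 32 && (((X >> (31 - N)) & 1) == ((X >> 31) & 1)))
    ++N;
  return N;
}

// The VSRAI rule on scalars: an arithmetic shift right by S is known to leave
// min(numSignBits(X) + S, 32) sign bits, matching the uge() clamp above.
static unsigned signBitsAfterAShr(int32_t X, unsigned S) {
  return std::min(numSignBits(X) + S, 32u);
}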
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) { + // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; @@ -26255,19 +26827,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { - SrcVT = MaskVT; + unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); + SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); + if (SrcVT != MaskVT) + V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = X86ISD::VZEXT; + Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) + : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } } + // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). + if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && + isUndefOrEqual(Mask[0], 0) && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } + // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. - if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { + if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; @@ -26285,7 +26870,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is256BitVector() && FloatDomain) { + if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; @@ -26304,7 +26889,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is512BitVector() && FloatDomain) { + if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { @@ -26343,24 +26928,26 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + bool AllowFloatDomain, + bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); bool ContainsZeros = false; - SmallBitVector Zeroable(NumMaskElts, false); + APInt Zeroable(NumMaskElts, false); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; - Zeroable[i] = isUndefOrZero(M); + if (isUndefOrZero(M)) + Zeroable.setBit(i); ContainsZeros |= (M == SM_SentinelZero); } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. 
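The matchUnaryVectorShuffle change above now recognises ZERO_EXTEND_VECTOR_INREG-style masks: for a given scale, source element i must land at position i*Scale and the remaining slots of each group must be zero or undef. A rough standalone model of that mask test; the sentinel constants and the helper name looksLikeZExtMask are illustrative, not the in-tree helpers.

#include <vector>

// Sentinels mirroring SM_SentinelUndef / SM_SentinelZero.
constexpr int kUndef = -1;
constexpr int kZero = -2;

// True if Mask has the shape of a zero-extension by Scale: source element i at
// position i*Scale, with the other Scale-1 slots of each group zero or undef.
static bool looksLikeZExtMask(const std::vector<int> &Mask, unsigned Scale) {
  if (Scale < 2 || Mask.size() % Scale != 0)
    return false;
  for (unsigned i = 0, e = Mask.size() / Scale; i != e; ++i) {
    int M = Mask[i * Scale];
    if (M != kUndef && M != int(i))
      return false;
    for (unsigned j = 1; j != Scale; ++j) {
      int Z = Mask[i * Scale + j];
      if (Z != kUndef && Z != kZero)
        return false;
    }
  }
  return true;
}
// e.g. {0, kZero, 1, kZero} is a 2x zero-extension of a two-element source.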
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, MaskVT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); @@ -26423,19 +27010,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if (FloatDomain && !Subtarget.hasAVX()) + if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX()) return false; // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) - FloatDomain = true; + if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) { + AllowFloatDomain = true; + AllowIntDomain = false; + } // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); + ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } @@ -26443,7 +27032,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, SmallVector<int, 4> RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } @@ -26452,7 +27041,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // VPERMILPD can permute with a non-repeating shuffle. - if (FloatDomain && MaskScalarSizeInBits == 64) { + if (AllowFloatDomain && MaskScalarSizeInBits == 64) { Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; @@ -26476,8 +27065,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); - Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); - ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); + Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); + ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; @@ -26487,34 +27076,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. 
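The unary permute matching above keeps encoding its result through getV4X86ShuffleImm, i.e. the standard PSHUFD/SHUFPS/VPERMILPS control byte with two bits per destination lane, lane 0 in the low bits. A small sketch of that encoding; encodeV4ShuffleImm is an illustrative name, and undef lanes are simply treated as 0 here.

// Encode a 4-element shuffle mask as the 8-bit immediate used by
// PSHUFD/SHUFPS/VPERMILPS: bits [2*i+1 : 2*i] select the source lane for
// destination lane i.
static unsigned encodeV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (i * 2);
  return Imm; // {3,2,1,0} -> 0x1B (full reverse), {0,0,0,0} -> 0x00
}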
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, SDValue &V1, SDValue &V2, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { - if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; ShuffleVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; ShuffleVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; ShuffleVT = MaskVT; return true; @@ -26527,57 +27118,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { - MVT LegalVT = MaskVT; - if (LegalVT.is256BitVector() && !Subtarget.hasAVX2()) - LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); - - SmallVector<int, 64> Unpckl, Unpckh; - if (IsUnary) { - createUnpackShuffleMask(MaskVT, Unpckl, true, true); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - V2 = V1; - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, true); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - V2 = V1; - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - } else { - createUnpackShuffleMask(MaskVT, Unpckl, true, false); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, false); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckl); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckh); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } + if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, + DAG, Subtarget)) { + ShuffleVT = MaskVT; + if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2()) + ShuffleVT = (32 == EltSizeInBits ? 
MVT::v8f32 : MVT::v4f64); + return true; } } @@ -26585,17 +27131,19 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, - SDValue &V1, SDValue &V2, - SDLoc &DL, SelectionDAG &DAG, + bool AllowFloatDomain, + bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. - if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; @@ -26606,77 +27154,74 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to X86ISD::BLENDI. - if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || - (Subtarget.hasAVX() && MaskVT.is256BitVector()))) { - // Determine a type compatible with X86ISD::BLENDI. - // TODO - add 16i16 support (requires lane duplication). - MVT BlendVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (BlendVT == MVT::v4i64) - BlendVT = MVT::v8i32; - else if (BlendVT == MVT::v2i64) - BlendVT = MVT::v4i32; - } else { - if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) - BlendVT = MVT::v8i16; - else if (BlendVT == MVT::v4i64) - BlendVT = MVT::v4f64; - else if (BlendVT == MVT::v8i32) - BlendVT = MVT::v8f32; - } - - unsigned BlendSize = BlendVT.getVectorNumElements(); - unsigned MaskRatio = BlendSize / NumMaskElts; - - // Can we blend with zero? - if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, - /*Low*/ 0) && - NumMaskElts <= BlendVT.getVectorNumElements()) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if (Mask[i / MaskRatio] < 0) - PermuteImm |= 1u << i; - - V2 = getZeroVector(BlendVT, Subtarget, DAG, DL); - Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; - return true; - } - - // Attempt to match as a binary blend. - if (NumMaskElts <= BlendVT.getVectorNumElements()) { - bool MatchBlend = true; - for (int i = 0; i != (int)NumMaskElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - else if (M == SM_SentinelZero) - MatchBlend = false; - else if ((M != i) && (M != (i + (int)NumMaskElts))) - MatchBlend = false; - } + if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || + (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || + (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, + BlendMask)) { + if (MaskVT == MVT::v16i16) { + // We can only use v16i16 PBLENDW if the lanes are repeated. + SmallVector<int, 8> RepeatedMask; + if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, + RepeatedMask)) { + assert(RepeatedMask.size() == 8 && + "Repeated mask size doesn't match!"); + PermuteImm = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 8) + PermuteImm |= 1 << i; + V1 = ForceV1Zero ? 
getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; + return true; + } + } else { + // Determine a type compatible with X86ISD::BLENDI. + ShuffleVT = MaskVT; + if (Subtarget.hasAVX2()) { + if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v8i32; + else if (ShuffleVT == MVT::v2i64) + ShuffleVT = MVT::v4i32; + } else { + if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) + ShuffleVT = MVT::v8i16; + else if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v4f64; + else if (ShuffleVT == MVT::v8i32) + ShuffleVT = MVT::v8f32; + } - if (MatchBlend) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if ((int)NumMaskElts <= Mask[i / MaskRatio]) - PermuteImm |= 1u << i; + if (!ShuffleVT.isFloatingPoint()) { + int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); + BlendMask = + scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); + ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); + ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); + } + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; return true; } } } // Attempt to combine to INSERTPS. - if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) { - SmallBitVector Zeroable(4, false); + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector()) { + APInt Zeroable(4, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] < 0) - Zeroable[i] = true; + Zeroable.setBit(i); - if (Zeroable.any() && + if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; @@ -26685,22 +27230,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to SHUFPD. - if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) || - (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) || - (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 64 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. - if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || - (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || - (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 32 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector<int, 4> RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { + // Match each half of the repeated mask, to determine if its just + // referencing one of the vectors, is zeroable or entirely undef. 
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; @@ -26732,7 +27281,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } @@ -26764,7 +27313,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); - SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1])); + SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) + : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); @@ -26853,6 +27403,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; + // Which shuffle domains are permitted? + // Permit domain crossing at higher combine depths. + bool AllowFloatDomain = FloatDomain || (Depth > 3); + bool AllowIntDomain = !FloatDomain || (Depth > 3); + if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper @@ -26869,8 +27424,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26884,8 +27440,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, + ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26901,8 +27458,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, - Shuffle, ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, + UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26918,8 +27476,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, - DAG, Subtarget, Shuffle, ShuffleVT, + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + AllowIntDomain, V1, V2, DL, DAG, + Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! 
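When the BLENDI matching a few hunks above falls back to a narrower integer element type, the per-element blend mask gets rescaled so that each original mask bit covers Scale consecutive narrower lanes. A plausible standalone model of that rescaling; the function name and the assumption that Scale stays below 64 are mine, not the in-tree scaleVectorShuffleBlendMask signature.

#include <cstdint>

// Expand each set bit i of a blend mask over NumElts elements into Scale
// consecutive set bits, matching a blend done on Scale-times-narrower lanes.
static uint64_t scaleBlendMask(uint64_t Mask, unsigned NumElts, unsigned Scale) {
  uint64_t Scaled = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask & (uint64_t(1) << i))
      Scaled |= ((uint64_t(1) << Scale) - 1) << (i * Scale);
  return Scaled; // e.g. 0b0101 over 4 elts with Scale 2 -> 0b00110011
}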
@@ -27039,12 +27598,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); - SmallBitVector UndefElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) @@ -27228,8 +27787,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, // Extract constant bits from each source op. bool OneUseConstantOp = false; - SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps); - SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps); + SmallVector<APInt, 16> UndefEltsOps(NumOps); + SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); @@ -27245,18 +27804,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, return false; // Shuffle the constant bits according to the mask. - SmallBitVector UndefElts(NumMaskElts, false); - SmallBitVector ZeroElts(NumMaskElts, false); - SmallBitVector ConstantElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); + APInt ZeroElts(NumMaskElts, 0); + APInt ConstantElts(NumMaskElts, 0); SmallVector<APInt, 8> ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); @@ -27266,21 +27825,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } - ConstantElts[i] = true; + ConstantElts.setBit(i); ConstantBitData[i] = Bits; } - assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts); + assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. MVT MaskSVT; @@ -27330,6 +27889,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, + ArrayRef<const SDNode*> SrcNodes, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -27353,13 +27913,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. - SDValue Input0, Input1; - SmallVector<int, 16> OpMask; - if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) + SmallVector<int, 64> OpMask; + SmallVector<SDValue, 2> OpInputs; + if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask)) return false; + assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); + SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); + SDValue Input1 = (OpInputs.size() > 1 ? 
OpInputs[1] : SDValue()); + // Add the inputs to the Ops list, avoiding duplicates. - SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end()); + SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); int InputIdx0 = -1, InputIdx1 = -1; for (int i = 0, e = Ops.size(); i < e; ++i) { @@ -27392,8 +27956,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, (RootRatio == 1) != (OpRatio == 1)) && "Must not have a ratio for both incoming and op masks!"); - SmallVector<int, 16> Mask; - Mask.reserve(MaskWidth); + SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the @@ -27403,7 +27966,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, int RootIdx = i / RootRatio; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. - Mask.push_back(RootMask[RootIdx]); + Mask[i] = RootMask[RootIdx]; continue; } @@ -27413,7 +27976,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { - Mask.push_back(RootMaskedIdx); + Mask[i] = RootMaskedIdx; continue; } @@ -27423,7 +27986,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. - Mask.push_back(OpMask[OpIdx]); + Mask[i] = OpMask[OpIdx]; continue; } @@ -27439,7 +28002,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, OpMaskedIdx += InputIdx1 * MaskWidth; } - Mask.push_back(OpMaskedIdx); + Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. @@ -27457,28 +28020,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, } // Remove unused shuffle source ops. - SmallVector<SDValue, 8> UsedOps; - for (int i = 0, e = Ops.size(); i < e; ++i) { - int lo = UsedOps.size() * MaskWidth; - int hi = lo + MaskWidth; - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedOps.push_back(Ops[i]); - continue; - } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; - } - assert(!UsedOps.empty() && "Shuffle with no inputs detected"); - Ops = UsedOps; + resolveTargetShuffleInputsAndMask(Ops, Mask); + assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); - // See if we can recurse into each shuffle source op (if it's a target shuffle). + // Update the list of shuffle nodes that have been combined so far. + SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), + SrcNodes.end()); + CombinedNodes.push_back(Op.getNode()); + + // See if we can recurse into each shuffle source op (if it's a target + // shuffle). The source op should only be combined if it either has a + // single use (i.e. current Op) or all its users have already been combined. 
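The mask-merging loop earlier in this hunk composes the accumulated root mask with the operand shuffle's mask: the operand's shuffle is applied to the input first, and the root mask then indexes into its result. In the simplified single-input, equal-width case that composition is just the lookup below; the sentinel constants and composeMasks are illustrative names, not the recursive combiner itself.

#include <vector>

constexpr int kSentinelUndef = -1;
constexpr int kSentinelZero = -2;

// Compose Root over Op (Op applied to the input first): the final element i is
// input[Op[Root[i]]], with undef/zero sentinels propagated unchanged.
static std::vector<int> composeMasks(const std::vector<int> &Root,
                                     const std::vector<int> &Op) {
  std::vector<int> Out(Root.size());
  for (size_t i = 0; i != Root.size(); ++i)
    Out[i] = Root[i] < 0 ? Root[i] : Op[Root[i]];
  return Out;
}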
for (int i = 0, e = Ops.size(); i < e; ++i) - if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode())) - if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1, - HasVariableMask, DAG, DCI, Subtarget)) + if (Ops[i].getNode()->hasOneUse() || + SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) + if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes, + Depth + 1, HasVariableMask, DAG, DCI, + Subtarget)) return true; // Attempt to constant fold all of the constant source ops. @@ -27495,7 +28055,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. - SmallVector<int, 16> WidenedMask; + SmallVector<int, 64> WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } @@ -27561,8 +28121,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); @@ -27842,19 +28401,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } case X86ISD::MOVSD: case X86ISD::MOVSS: { - bool isFloat = VT.isFloatingPoint(); SDValue V0 = peekThroughBitcasts(N->getOperand(0)); SDValue V1 = peekThroughBitcasts(N->getOperand(1)); - bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); - bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); - assert(!(isZero0 && isZero1) && "Zeroable shuffle detected."); + if (isZero0 && isZero1) + return SDValue(); // We often lower to MOVSD/MOVSS from integer as well as native float // types; remove unnecessary domain-crossing bitcasts if we can to make it // easier to combine shuffles later on. We've already accounted for the // domain switching cost when we decided to lower with it. + bool isFloat = VT.isFloatingPoint(); + bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); + bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); @@ -28025,7 +28585,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; @@ -28173,12 +28733,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); - - // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) - return SDValue(); - // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. 
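The FADD/FSUB blend fusion mentioned just above targets the classic addsub shape: a shuffle that takes the even lanes of a subtract and the odd lanes of an add of the same operands is a single ADDSUBPS. A reference sketch with SSE intrinsics, assuming SSE3 for addsub and SSE4.1 for the explicit blend; this only illustrates the pattern, it is not the DAG combine.

#include <immintrin.h>

// The blend-of-FADD/FSUB shape that gets fused.
static __m128 addsub_reference(__m128 A, __m128 B) {
  __m128 Sum  = _mm_add_ps(A, B);
  __m128 Diff = _mm_sub_ps(A, B);
  // Immediate 0b1010: lanes 1 and 3 from Sum, lanes 0 and 2 from Diff.
  return _mm_blend_ps(Diff, Sum, 0xA);
}

// The single instruction it becomes: {A0-B0, A1+B1, A2-B2, A3+B3}.
static __m128 addsub_fused(__m128 A, __m128 B) {
  return _mm_addsub_ps(A, B);
}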
if (TLI.isTypeLegal(VT)) @@ -28249,11 +28804,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) - Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + Elts.clear(); + break; + } - if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) - return LD; + if (Elts.size() == VT.getVectorNumElements()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) + return LD; // For AVX2, we sometimes want to combine // (vector_shuffle <mask> (concat_vectors t1, undef) @@ -28276,7 +28838,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // a particular chain. SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. @@ -28303,18 +28865,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EVT OriginalVT = InVec.getValueType(); - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || - BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) - return SDValue(); - InVec = InVec.getOperand(0); - } + // Peek through bitcasts, don't duplicate a load with other uses. + InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); + if (!CurrentVT.isVector() || + CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -28393,19 +28950,41 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + + // Since MMX types are special and don't usually play with other vector types, + // it's better to handle them early to be sure we emit efficient code by + // avoiding store-load conversions. - // Detect bitcasts between i32 to x86mmx low word. Since MMX types are - // special and don't usually play with other vector types, it's better to - // handle them early to be sure we emit efficient code by avoiding - // store-load conversions. + // Detect bitcasts between i32 to x86mmx low word. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && - N0.getValueType() == MVT::v2i32 && - isNullConstant(N0.getOperand(1))) { + SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } + // Detect bitcasts between element or subvector extraction to x86mmx. 
+ if (VT == MVT::x86mmx && + (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType().is128BitVector()) + return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, + DAG.getBitcast(MVT::v2i64, N00)); + } + + // Detect bitcasts from FP_TO_SINT to x86mmx. + if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && + N0.getOpcode() == ISD::FP_TO_SINT) { + SDLoc DL(N0); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, + DAG.getBitcast(MVT::v2i64, Res)); + } + // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the @@ -28511,12 +29090,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOpcode() != ISD::SETCC) return false; ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT) + if (CC != ISD::SETGT && CC != ISD::SETLT) return false; SDValue SelectOp1 = Select->getOperand(1); SDValue SelectOp2 = Select->getOperand(2); + // The following instructions assume SelectOp1 is the subtraction operand + // and SelectOp2 is the negation operand. + // In the case of SETLT this is the other way around. + if (CC == ISD::SETLT) + std::swap(SelectOp1, SelectOp2); + // The second operand of the select should be the negation of the first // operand, which is implemented as 0 - SelectOp1. if (!(SelectOp2.getOpcode() == ISD::SUB && @@ -28529,8 +29114,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOperand(0) != SelectOp1) return false; - // The second operand of the comparison can be either -1 or 0. - if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || + // In SetLT case, The second operand of the comparison can be either 1 or 0. + APInt SplatVal; + if ((CC == ISD::SETLT) && + !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && + SplatVal == 1) || + (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) + return false; + + // In SetGT case, The second operand of the comparison can be either -1 or 0. + if ((CC == ISD::SETGT) && + !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) return false; @@ -28576,17 +29170,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } +// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +static SDValue combineHorizontalPredicateResult(SDNode *Extract, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Bail without SSE2 or with AVX512VL (which uses predicate registers). + if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + unsigned BitWidth = ExtractVT.getSizeInBits(); + if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && + ExtractVT != MVT::i8) + return SDValue(); + + // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + for (ISD::NodeType Op : {ISD::OR, ISD::AND}) { + SDValue Match = matchBinOpReduction(Extract, Op); + if (!Match) + continue; + + // EXTRACT_VECTOR_ELT can require implicit extension of the vector element + // which we can't support here for now. 
+ if (Match.getScalarValueSizeInBits() != BitWidth) + continue; + + // We require AVX2 for PMOVMSKB for v16i16/v32i8; + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && + ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) + return SDValue(); + + // Don't bother performing this for 2-element vectors. + if (Match.getValueType().getVectorNumElements() <= 2) + return SDValue(); + + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); + + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskVT; + if (64 == BitWidth || 32 == BitWidth) + MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + APInt CompareBits; + ISD::CondCode CondCode; + if (Op == ISD::OR) { + // any_of -> MOVMSK != 0 + CompareBits = APInt::getNullValue(32); + CondCode = ISD::CondCode::SETNE; + } else { + // all_of -> MOVMSK == ((1 << NumElts) - 1) + CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CondCode = ISD::CondCode::SETEQ; + } + + // Perform the select as i32/i64 and then truncate to avoid partial register + // stalls. + unsigned ResWidth = std::max(BitWidth, 32u); + EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); + SDLoc DL(Extract); + SDValue Zero = DAG.getConstant(0, DL, ResVT); + SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); + SDValue Res = DAG.getBitcast(MaskVT, Match); + Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); + Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), + Ones, Zero, CondCode); + return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + } + + return SDValue(); +} + static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is appropriate - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. + // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) + if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; @@ -28595,15 +29264,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, else if (Subtarget.hasAVX2()) RegSize = 256; - // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) + if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. SDValue Root = matchBinOpReduction(Extract, ISD::ADD); + // The operand is expected to be zero extended from i8 + // (verified in detectZextAbsDiff). + // In order to convert to i64 and above, additional any/zero/sign + // extend is expected. + // The zero extend from 32 bit has no mathematical effect on the result. + // Also the sign extend is basically zero extend + // (extends the sign bit which is zero). + // So it is correct to skip the sign/zero extend instruction. 
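The extension-skipping above is sound because PSADBW already produces small non-negative partial sums: each 64-bit lane holds the sum of at most eight byte-sized absolute differences, so it never exceeds 8 * 255 = 2040, its sign bit is always clear, and sign-, zero- and any-extension are interchangeable. A reference sketch with SSE2 intrinsics (x86-64 for the 64-bit extracts); sad16u8 is an illustrative helper, not the DAG combine itself.

#include <immintrin.h>
#include <cstdint>

// Horizontal sum of |A[i] - B[i]| over 16 unsigned bytes. Each 64-bit lane of
// the PSADBW result is at most 8 * 255, so widening it further with either a
// sign or a zero extension yields the same value.
static uint64_t sad16u8(__m128i A, __m128i B) {
  __m128i Sad = _mm_sad_epu8(A, B);            // two partial sums, one per lane
  __m128i Hi  = _mm_unpackhi_epi64(Sad, Sad);  // move the upper lane down
  return uint64_t(_mm_cvtsi128_si64(Sad)) + uint64_t(_mm_cvtsi128_si64(Hi));
}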
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) + Root = Root.getOperand(0); + // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || (Root.getOpcode() != ISD::VSELECT)) @@ -28614,7 +29296,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); - // Create the SAD instruction + // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); @@ -28636,13 +29318,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - // Return the lowest i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32); + MVT Type = Extract->getSimpleValueType(0); + unsigned TypeSizeInBits = Type.getSizeInBits(); + // Return the lowest TypeSizeInBits bits. + MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } +// Attempt to peek through a target shuffle and extract the scalar from the +// source. +static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Src = N->getOperand(0); + SDValue Idx = N->getOperand(1); + + EVT VT = N->getValueType(0); + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + + // Don't attempt this for boolean mask vectors or unknown extraction indices. + if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Resolve the target shuffle inputs and mask. + SmallVector<int, 16> Mask; + SmallVector<SDValue, 2> Ops; + if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask)) + return SDValue(); + + // Attempt to narrow/widen the shuffle mask to the correct size. + if (Mask.size() != NumSrcElts) { + if ((NumSrcElts % Mask.size()) == 0) { + SmallVector<int, 16> ScaledMask; + int Scale = NumSrcElts / Mask.size(); + scaleShuffleMask(Scale, Mask, ScaledMask); + Mask = std::move(ScaledMask); + } else if ((Mask.size() % NumSrcElts) == 0) { + SmallVector<int, 16> WidenedMask; + while (Mask.size() > NumSrcElts && + canWidenShuffleElements(Mask, WidenedMask)) + Mask = std::move(WidenedMask); + // TODO - investigate support for wider shuffle masks with known upper + // undef/zero elements for implicit zero-extension. + } + } + + // Check if narrowing/widening failed. + if (Mask.size() != NumSrcElts) + return SDValue(); + + int SrcIdx = Mask[N->getConstantOperandVal(1)]; + SDLoc dl(N); + + // If the shuffle source element is undef/zero then we can just accept it. + if (SrcIdx == SM_SentinelUndef) + return DAG.getUNDEF(VT); + + if (SrcIdx == SM_SentinelZero) + return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) + : DAG.getConstant(0, dl, VT); + + SDValue SrcOp = Ops[SrcIdx / Mask.size()]; + SrcOp = DAG.getBitcast(SrcVT, SrcOp); + SrcIdx = SrcIdx % Mask.size(); + + // We can only extract other elements from 128-bit vectors and in certain + // circumstances, depending on SSE-level. + // TODO: Investigate using extract_subvector for larger vectors. 
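combineExtractWithShuffle above reduces an element extract from a shuffle to an extract from one of the shuffle's sources: look the index up in the resolved mask, fold undef/zero immediately, and otherwise pick out the source operand and lane the mask points at. A simplified equal-width model of that lookup; the sentinels, the struct, and lookupExtract are illustrative names only.

#include <vector>

constexpr int kMaskUndef = -1;
constexpr int kMaskZero = -2;

struct ExtractResult {
  int SrcOp;   // which shuffle input (0 or 1), or -1 for undef/zero
  int SrcLane; // lane within that input, or the sentinel itself
};

// For "extract lane Idx of shuffle(Ops[0], Ops[1], Mask)", return where the
// value actually comes from. Mask values index the concatenation of both
// inputs, so dividing by the per-input width picks the operand.
static ExtractResult lookupExtract(const std::vector<int> &Mask, unsigned Idx) {
  int M = Mask[Idx];
  if (M < 0)
    return {-1, M};                  // undef or zero, no source needed
  int NumElts = int(Mask.size());
  return {M / NumElts, M % NumElts}; // source operand and lane within it
}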
+ // TODO: Investigate float/double extraction if it will be just stored. + if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && + ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { + assert(SrcSVT == VT && "Unexpected extraction type"); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + } + + if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || + (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { + assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && + "Unexpected extraction type"); + unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); + SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp, + DAG.getValueType(SrcSVT)); + return DAG.getZExtOrTrunc(Assert, dl, VT); + } + + return SDValue(); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading @@ -28653,14 +29425,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; + if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) + return NewOp; + SDValue InputVector = N->getOperand(0); + SDValue EltIdx = N->getOperand(1); + + EVT SrcVT = InputVector.getValueType(); + EVT VT = N->getValueType(0); SDLoc dl(InputVector); + + // Detect mmx extraction of all bits as a i64. It works better as a bitcast. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { + SDValue MMXSrc = InputVector.getOperand(0); + + // The bitcast source is a direct mmx result. + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getBitcast(VT, InputVector); + } + // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && - N->getValueType(0) == MVT::i32 && - InputVector.getValueType() == MVT::v2i32 && - isa<ConstantSDNode>(N->getOperand(1)) && - N->getConstantOperandVal(1) == 0) { + VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. @@ -28668,15 +29455,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } - EVT VT = N->getValueType(0); - - if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && - InputVector.getOpcode() == ISD::BITCAST && + if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && + isa<ConstantSDNode>(EltIdx) && isa<ConstantSDNode>(InputVector.getOperand(0))) { - uint64_t ExtractedElt = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - uint64_t InputValue = - cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t ExtractedElt = N->getConstantOperandVal(1); + uint64_t InputValue = InputVector.getConstantOperandVal(0); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } @@ -28687,9 +29470,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; + // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. 
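The MOVMSK rewrite invoked just below collapses an OR (any_of) or AND (all_of) reduction of all-bits/zero compare results into one mask extraction plus a scalar compare. Sketched with SSE intrinsics for a v4i32 compare result, assuming SSE2 and <immintrin.h>; this shows the shape of the output, not the DAG code itself.

#include <immintrin.h>

// any_of: at least one lane of the 0/-1 compare vector is set.
static bool any_of_v4i32(__m128i Cmp) {
  return _mm_movemask_ps(_mm_castsi128_ps(Cmp)) != 0;   // MOVMSK != 0
}

// all_of: every lane is set, i.e. MOVMSK == (1 << NumElts) - 1.
static bool all_of_v4i32(__m128i Cmp) {
  return _mm_movemask_ps(_mm_castsi128_ps(Cmp)) == 0xF;
}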
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) + return Cmp; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. - if (InputVector.getValueType() != MVT::v4i32) + if (SrcVT != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a @@ -28717,9 +29504,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); // Record which element was extracted. - ExtractedElements |= - 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); - + ExtractedElements |= 1 << Extract->getConstantOperandVal(1); Uses.push_back(Extract); } @@ -28752,11 +29537,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo()); - EVT ElementType = InputVector.getValueType().getVectorElementType(); + EVT ElementType = SrcVT.getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. @@ -28779,8 +29564,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; - SDValue Idx = Extract->getOperand(1); - uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + uint64_t IdxVal = Extract->getConstantOperandVal(1); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } @@ -28788,6 +29572,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// TODO - merge with combineExtractVectorElt once it can handle the implicit +// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in: +// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and +// combineBasicSADPattern. +static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + return combineExtractWithShuffle(N, DAG, DCI, Subtarget); +} + /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. static SDValue @@ -28812,12 +29606,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // This situation only applies to avx512. if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { - //Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), - DL, CondVT)); - //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + // Invert the cond to not(cond) : xor(op,allones)=not(op) + SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getAllOnesConstant(DL, CondVT)); + // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 + return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that @@ -28920,18 +29713,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(ShAmt, DL, MVT::i8)); } - // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. - if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) { - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); - return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - } - // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { @@ -29049,7 +29830,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. - if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29063,7 +29844,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op1.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, Op1, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); return true; } case ISD::EXTRACT_SUBVECTOR: { @@ -29072,7 +29853,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. - if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29084,7 +29865,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op0.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); + return true; + } + case X86ISD::SUBV_BROADCAST: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + // Only change element size, not type. + if (VT.isInteger() != Op.getSimpleValueType().isInteger()) + return false; + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0)); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0)); return true; } } @@ -29370,8 +30167,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the - // condition so that the blends can use the high bit of each element and use - // SimplifyDemandedBits to simplify the condition operand. + // condition so that blends can use the high (sign) bit of each element and + // use SimplifyDemandedBits to simplify the condition operand. 
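The restructuring described above relies on a property of the variable blend instructions it targets: BLENDVPS/PBLENDVB and friends consult only the sign bit of each mask element, which is why demanding just the high bit of the condition is sufficient. The same behaviour through an SSE4.1 intrinsic, for reference; select_by_sign is an illustrative wrapper only.

#include <immintrin.h>

// Variable blend: lanes of A where the mask element's sign bit is set,
// lanes of B otherwise. The remaining bits of each mask element are ignored.
static __m128 select_by_sign(__m128 Mask, __m128 A, __m128 B) {
  return _mm_blendv_ps(B, A, Mask);
}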
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { @@ -29406,49 +30203,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); - APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); - + APInt DemandedMask(APInt::getSignBit(BitWidth)); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { - // If we changed the computation somewhere in the DAG, this change - // will affect all users of Cond. - // Make sure it is fine and update all the nodes so that we do not - // use the generic VSELECT anymore. Otherwise, we may perform - // wrong optimizations as we messed up with the actual expectation + // If we changed the computation somewhere in the DAG, this change will + // affect all users of Cond. Make sure it is fine and update all the nodes + // so that we do not use the generic VSELECT anymore. Otherwise, we may + // perform wrong optimizations as we messed with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are - // set properly. - for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. + // Check all uses of the condition operand to check whether it will be + // consumed by non-BLEND instructions. Those may require that all bits + // are set properly. + for (SDNode *U : Cond->uses()) { + // TODO: Add other opcodes eventually lowered into BLEND. + if (U->getOpcode() != ISD::VSELECT) return SDValue(); + } - // Update all the users of the condition, before committing the change, - // so that the VSELECT optimizations that expect the correct vector - // boolean value will not be triggered. - for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - DAG.ReplaceAllUsesOfValueWith( - SDValue(*I, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), - Cond, I->getOperand(1), I->getOperand(2))); + // Update all users of the condition before committing the change, so + // that the VSELECT optimizations that expect the correct vector boolean + // value will not be triggered. + for (SDNode *U : Cond->uses()) { + SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), + U->getValueType(0), Cond, U->getOperand(1), + U->getOperand(2)); + DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); + } DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } - // At this point, only Cond is changed. Change the condition - // just for N to keep the opportunity to optimize all other - // users their own way. - DAG.ReplaceAllUsesOfValueWith( - SDValue(N, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), - TLO.New, N->getOperand(1), N->getOperand(2))); + // Only Cond (rather than other nodes in the computation chain) was + // changed. Change the condition just for N to keep the opportunity to + // optimize all other users their own way. 
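// Aside (illustrative sketch, not part of this patch): the reason only the
// sign bit of each condition element is demanded above is that the variable
// blend instructions a SHRUNKBLEND is eventually matched to (e.g. PBLENDVB)
// select on nothing but the high bit of each mask element. Assumes SSE4.1:
#include <smmintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  __m128i A = _mm_set1_epi8(1);
  __m128i B = _mm_set1_epi8(2);
  // Two masks that agree only in the per-byte sign bit.
  __m128i AllOnes = _mm_set1_epi8((char)0xFF);
  __m128i SignOnly = _mm_set1_epi8((char)0x80);
  __m128i R0 = _mm_blendv_epi8(A, B, AllOnes);
  __m128i R1 = _mm_blendv_epi8(A, B, SignOnly);
  std::printf("%d\n", std::memcmp(&R0, &R1, sizeof(R0)) == 0); // prints 1
  return 0;
}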
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); return SDValue(); } } @@ -29456,7 +30249,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Look for vselects with LHS/RHS being bitcasted from an operation that // can be executed on another type. Push the bitcast to the inputs of // the operation. This exposes opportunities for using masking instructions. - if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() && + if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && CondVT.getVectorElementType() == MVT::i1) { if (combineBitcastForMaskedOp(LHS, DAG, DCI)) return SDValue(N, 0); @@ -30208,22 +31001,37 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, } if (!NewMul) { - assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) - && "Both cases that could cause potential overflows should have " - "already been handled."); - if (isPowerOf2_64(MulAmt - 1)) - // (mul x, 2^N + 1) => (add (shl x, N), x) - NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt - 1), DL, - MVT::i8))); - - else if (isPowerOf2_64(MulAmt + 1)) - // (mul x, 2^N - 1) => (sub (shl x, N), x) - NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, - N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt + 1), - DL, MVT::i8)), N->getOperand(0)); + assert(MulAmt != 0 && + MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && + "Both cases that could cause potential overflows should have " + "already been handled."); + int64_t SignMulAmt = C->getSExtValue(); + if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) && + (SignMulAmt != -INT64_MAX)) { + int NumSign = SignMulAmt > 0 ? 
1 : -1; + bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); + bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); + if (IsPowerOf2_64PlusOne) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode( + ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, + MVT::i8))); + } else if (IsPowerOf2_64MinusOne) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode( + ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, + MVT::i8)), + N->getOperand(0)); + } + // To negate, subtract the number from zero + if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); + } } if (NewMul) @@ -30396,42 +31204,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) && - "Unexpected opcode"); +static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || + X86ISD::VSRLI == Opcode) && + "Unexpected shift opcode"); + bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - - // This fails for mask register (vXi1) shifts. - if ((NumBitsPerElt % 8) != 0) - return SDValue(); + assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && + "Unexpected value type"); // Out of range logical bit shifts are guaranteed to be zero. - APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); - if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // Out of range arithmetic bit shifts splat the sign bit. + APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue(); + if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { + if (LogicalShift) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + else + ShiftVal = NumBitsPerElt - 1; + } // Shift N0 by zero -> N0. if (!ShiftVal) - return N->getOperand(0); + return N0; // Shift zero -> zero. - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + if (ISD::isBuildVectorAllZeros(N0.getNode())) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). + // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. + // TODO - support other sra opcodes as needed. + if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && + N0.getOpcode() == X86ISD::VSRAI) + return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); + // We can decode 'whole byte' logical bit shifts as shuffles. - if ((ShiftVal.getZExtValue() % 8) == 0) { + if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. 
NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } + // Constant Folding. + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (N->isOnlyUserOf(N0.getNode()) && + getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { + assert(EltBits.size() == VT.getVectorNumElements() && + "Unexpected shift value type"); + unsigned ShiftImm = ShiftVal.getZExtValue(); + for (APInt &Elt : EltBits) { + if (X86ISD::VSHLI == Opcode) + Elt = Elt.shl(ShiftImm); + else if (X86ISD::VSRAI == Opcode) + Elt = Elt.ashr(ShiftImm); + else + Elt = Elt.lshr(ShiftImm); + } + return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); + } + + return SDValue(); +} + +static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert( + ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && + N->getValueType(0) == MVT::v8i16)) && + "Unexpected vector insertion"); + + // Attempt to combine PINSRB/PINSRW patterns to a shuffle. + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget); return SDValue(); } @@ -30550,33 +31411,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) return SDValue(); - // Canonicalize XOR to the left. - if (N1.getOpcode() == ISD::XOR) - std::swap(N0, N1); + if (N0.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); - if (N0.getOpcode() != ISD::XOR) - return SDValue(); - - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); + if (N1.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); - N01 = peekThroughBitcasts(N01); - - // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an - // insert_subvector building a 256-bit AllOnes vector. - if (!ISD::isBuildVectorAllOnes(N01.getNode())) { - if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR) - return SDValue(); - - SDValue V1 = N01->getOperand(0); - SDValue V2 = N01->getOperand(1); - if (V1.getOpcode() != ISD::INSERT_SUBVECTOR || - !V1.getOperand(0).isUndef() || - !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) || - !ISD::isBuildVectorAllOnes(V2.getNode())) - return SDValue(); - } - return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1); + return SDValue(); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized @@ -30696,38 +31539,34 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is -/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to -/// eliminate loading the vector constant mask value. This relies on the fact -/// that a PCMP always creates an all-ones or all-zeros bitmask per element. 
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { +/// If this is a zero/all-bits result that is bitwise-anded with a low bits +/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' +/// with a shift-right to eliminate loading the vector constant mask value. +static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); + EVT VT0 = Op0.getValueType(); + EVT VT1 = Op1.getValueType(); - // TODO: Use AssertSext to mark any nodes that have the property of producing - // all-ones or all-zeros. Then check for that node rather than particular - // opcodes. - if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT) + if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); - // The existence of the PCMP node guarantees that we have the required SSE2 or - // AVX2 for a shift of this vector type, but there is no vector shift by - // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the - // masked compare nodes, so they should not make it here. - EVT VT0 = Op0.getValueType(); - EVT VT1 = Op1.getValueType(); - unsigned EltBitWidth = VT0.getScalarSizeInBits(); - if (VT0 != VT1 || EltBitWidth == 8) + APInt SplatVal; + if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || + !SplatVal.isMask()) return SDValue(); - assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256); + if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) + return SDValue(); - APInt SplatVal; - if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1) + unsigned EltBitWidth = VT0.getScalarSizeInBits(); + if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); - SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8); + unsigned ShiftVal = SplatVal.countTrailingOnes(); + SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -30747,7 +31586,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; - if (SDValue ShiftRight = combinePCMPAnd1(N, DAG)) + if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; EVT VT = N->getValueType(0); @@ -30760,7 +31599,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. 
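// Aside (illustrative sketch, not part of this patch): the scalar identity
// behind combineAndMaskToShift above. When every bit of X equals its sign bit
// (X is 0 or all-ones, as a PCMPEQ/PCMPGT lane is), masking the low K bits is
// the same as a logical shift right by (BitWidth - K):
#include <cassert>
#include <cstdint>

uint32_t maskLowBits(uint32_t X, unsigned K) { return X & ((1u << K) - 1); }
uint32_t shiftForm(uint32_t X, unsigned K) { return X >> (32 - K); }

int main() {
  const uint32_t LaneValues[] = {0u, ~0u}; // the only values such a lane takes
  for (uint32_t X : LaneValues)
    for (unsigned K = 1; K < 32; ++K)
      assert(maskLowBits(X, K) == shiftForm(X, K));
  return 0;
}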
@@ -30969,7 +31808,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && - N->getOperand(1).getConstantOperandVal(1) == 0 && + isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; @@ -31272,6 +32111,74 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + + // FIXME: Scalar type may be supported if we move it to vector register. + if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + + EVT SrcElVT = SrcVT.getScalarType(); + EVT DstElVT = DstVT.getScalarType(); + if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) + return false; + if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) + return false; + if (SrcVT.is512BitVector() || Subtarget.hasVLX()) + return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); + return false; +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { + if (In.getOpcode() != ISD::UMIN) + return SDValue(); + + //Saturation with truncation. We truncate from InVT to VT. + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && + "Unexpected types for truncate operation"); + + APInt C; + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return SDValue(); + return detectUSatPattern(In, VT); +} + +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT)) + return SDValue(); + if (auto USatVal = detectUSatPattern(In, VT)) + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + return SDValue(); +} + /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. 
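// Aside (illustrative sketch, not part of this patch): per element, the
// pattern detectUSatPattern matches above computes an unsigned saturating
// truncate, which is exactly what the VPMOVUS* family performs. A u32 -> u8
// scalar equivalent:
#include <algorithm>
#include <cassert>
#include <cstdint>

uint8_t truncUSat8(uint32_t X) {
  // truncate (umin (x, unsigned_max_of_dest_type)) to dest_type
  return static_cast<uint8_t>(std::min<uint32_t>(X, 0xFFu));
}

int main() {
  assert(truncUSat8(5) == 5);
  assert(truncUSat8(255) == 255);
  assert(truncUSat8(300) == 255); // clamps instead of wrapping to 44
  return 0;
}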
@@ -31664,7 +32571,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } @@ -31838,6 +32745,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); + if (SDValue Val = + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -32198,13 +33111,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); - auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) { - // TODO: Add extra cases where we can truncate both inputs for the - // cost of one (or none). - // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y ) + auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) { + unsigned TruncSizeInBits = VT.getScalarSizeInBits(); + + // Repeated operand, so we are only trading one output truncation for + // one input truncation. if (Op0 == Op1) return true; + // See if either operand has been extended from a smaller/equal size to + // the truncation size, allowing a truncation to combine with the extend. + unsigned Opcode0 = Op0.getOpcode(); + if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND || + Opcode0 == ISD::ZERO_EXTEND) && + Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + unsigned Opcode1 = Op1.getOpcode(); + if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND || + Opcode1 == ISD::ZERO_EXTEND) && + Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + // See if either operand is a single use constant which can be constant + // folded. SDValue BC0 = peekThroughOneUseBitcasts(Op0); SDValue BC1 = peekThroughOneUseBitcasts(Op1); return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || @@ -32236,7 +33166,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32252,7 +33182,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32458,6 +33388,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; + // The bitcast source is a direct mmx result. 
// Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { @@ -32804,6 +33738,34 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); } +/// Do target-specific dag combines on X86ISD::ANDNP nodes. +static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + // ANDNP(0, x) -> x + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return N->getOperand(1); + + // ANDNP(x, 0) -> 0 + if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) + return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); + + EVT VT = N->getValueType(0); + + // Attempt to recursively combine a bitmask ANDNP with shuffles. + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); +} + static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. @@ -33065,13 +34027,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } + if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && + isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { + // Invert and sign-extend a boolean is the same as zero-extend and subtract + // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently + // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. + // sext (xor Bool, -1) --> sub (zext Bool), 1 + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); + } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -33212,8 +34183,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Optimize x == -y --> x+y == 0 -/// x != -y --> x+y != 0 +/// Try to map a 128-bit or larger integer comparison to vector instructions +/// before type legalization splits it up into chunks. +static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); + + // We're looking for an oversized integer equality comparison, but ignore a + // comparison with zero because that gets special treatment in EmitTest(). + SDValue X = SetCC->getOperand(0); + SDValue Y = SetCC->getOperand(1); + EVT OpVT = X.getValueType(); + unsigned OpSize = OpVT.getSizeInBits(); + if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + return SDValue(); + + // TODO: Use PXOR + PTEST for SSE4.1 or later? + // TODO: Add support for AVX-512. 
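// Aside (illustrative sketch, not part of this patch): what the lowering below
// computes for a 128-bit equality, written with SSE2 intrinsics. If every byte
// compares equal, PMOVMSKB of the PCMPEQB result is 0xFFFF. (The combine works
// on i128 values rather than memory operands, but the per-byte logic is the
// same.)
#include <emmintrin.h>

bool equal128(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // pcmpeqb
  return _mm_movemask_epi8(Eq) == 0xFFFF; // pmovmskb, then compare to all-ones
}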
+ EVT VT = SetCC->getValueType(0); + SDLoc DL(SetCC); + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX2())) { + EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + + // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. + // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq + // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne + // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq + // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); + SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, + MVT::i32); + return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); + } + + return SDValue(); +} + static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); @@ -33222,21 +34232,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + if (CC == ISD::SETNE || CC == ISD::SETEQ) { + EVT OpVT = LHS.getValueType(); + // 0-x == y --> x+y == 0 + // 0-x != y --> x+y != 0 + if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && + LHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && + RHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } + if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) + return V; + } + if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { bool IsSEXT0 = @@ -33293,56 +34309,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of performSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces -// an all-ones bit which is more useful than 0/1 in some cases. 
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, - SelectionDAG &DAG, MVT VT) { - if (VT == MVT::i8) - return DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS), - DAG.getConstant(1, DL, VT)); - assert (VT == MVT::i1 && "Unexpected type for SECCC node"); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS)); -} - // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); - if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate - // materializing "setb reg". - // - // Do not flip "e > c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), - EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); - } - } - - // Materialize "setb reg" as "sbb reg,reg", since it can be extended without - // a zext and produces an all-ones bit which is more useful than 0/1 in some - // cases. - if (CC == X86::COND_B) - return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) return getSETCC(CC, Flags, DL, DAG); @@ -33352,7 +34325,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); @@ -33538,45 +34510,159 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// fold (add Y, (sete X, 0)) -> adc 0, Y -/// (add Y, (setne X, 0)) -> sbb -1, Y -/// (sub (sete X, 0), Y) -> sbb 0, Y -/// (sub (setne X, 0), Y) -> adc -1, Y -static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { +/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit +/// which is more useful than 0/1 in some cases. +static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { SDLoc DL(N); + // "Condition code B" is also known as "the carry flag" (CF). + SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); + SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); + MVT VT = N->getSimpleValueType(0); + if (VT == MVT::i8) + return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); - // Look through ZExts. - SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); - if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) - return SDValue(); + assert(VT == MVT::i1 && "Unexpected type for SETCC node"); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); +} + +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { + bool IsSub = N->getOpcode() == ISD::SUB; + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + + // If this is an add, canonicalize a zext operand to the RHS. + // TODO: Incomplete? What if both sides are zexts? + if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && + Y.getOpcode() != ISD::ZERO_EXTEND) + std::swap(X, Y); + + // Look through a one-use zext. + bool PeekedThroughZext = false; + if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { + Y = Y.getOperand(0); + PeekedThroughZext = true; + } - SDValue SetCC = Ext.getOperand(0); - if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + // If this is an add, canonicalize a setcc operand to the RHS. + // TODO: Incomplete? What if both sides are setcc? + // TODO: Should we allow peeking through a zext of the other operand? + if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && + Y.getOpcode() != X86ISD::SETCC) + std::swap(X, Y); + + if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); - X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + SDLoc DL(N); + EVT VT = N->getValueType(0); + X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); + + if (CC == X86::COND_B) { + // X + SETB Z --> X + (mask SBB Z, Z) + // X - SETB Z --> X - (mask SBB Z, Z) + // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? + SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + + if (CC == X86::COND_A) { + SDValue EFLAGS = Y->getOperand(1); + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), + EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + } + if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); - SDValue Cmp = SetCC.getOperand(1); + SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); - SDValue CmpOp0 = Cmp.getOperand(0); - SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, - DAG.getConstant(1, DL, CmpOp0.getValueType())); + // (cmp Z, 1) sets the carry flag if Z is 0. 
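// Aside (illustrative sketch, not part of this patch): CF after "cmp Z, 1" is
// the unsigned borrow of Z - 1, i.e. it is set exactly when Z == 0. Feeding
// that carry into ADC/SBB gives the folds listed below; a scalar model of two
// of them:
#include <cassert>
#include <cstdint>

uint32_t subSetccE(uint32_t X, uint32_t Z) {  // X - (Z == 0)
  bool CF = Z < 1;                            // carry out of "cmp Z, 1"
  return X - 0 - CF;                          // sbb X, 0
}
uint32_t addSetccNE(uint32_t X, uint32_t Z) { // X + (Z != 0)
  bool CF = Z < 1;
  return X - 0xFFFFFFFFu - CF;                // sbb X, -1
}

int main() {
  assert(subSetccE(10, 0) == 9 && subSetccE(10, 7) == 10);
  assert(addSetccNE(10, 0) == 10 && addSetccNE(10, 7) == 11);
  return 0;
}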
+ SDValue Z = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, + DAG.getConstant(1, DL, Z.getValueType())); + + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); - SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) + // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), - NewCmp); - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, + DAG.getConstant(-1ULL, DL, VT), NewCmp); + + // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) + // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, + DAG.getConstant(0, DL, VT), NewCmp); +} + +static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue MulOp = N->getOperand(0); + SDValue Phi = N->getOperand(1); + + if (MulOp.getOpcode() != ISD::MUL) + std::swap(MulOp, Phi); + if (MulOp.getOpcode() != ISD::MUL) + return SDValue(); + + ShrinkMode Mode; + if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode)) + return SDValue(); + + EVT VT = N->getValueType(0); + + unsigned RegSize = 128; + if (Subtarget.hasBWI()) + RegSize = 512; + else if (Subtarget.hasAVX2()) + RegSize = 256; + unsigned VectorSize = VT.getVectorNumElements() * 16; + // If the vector size is less than 128, or greater than the supported RegSize, + // do not use PMADD. + if (VectorSize < 128 || VectorSize > RegSize) + return SDValue(); + + SDLoc DL(N); + EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements()); + EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + VT.getVectorNumElements() / 2); + + // Shrink the operands of mul. 
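// Aside (illustrative sketch, not part of this patch): the (V)PMADDWD node
// built below multiplies adjacent pairs of signed 16-bit elements and sums
// each pair into a 32-bit lane, which is why the i32 multiply operands can be
// shrunk to i16 first. SSE2 sketch of the per-lane semantics:
#include <emmintrin.h>
#include <cassert>

int main() {
  __m128i A = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, -8);
  __m128i B = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  __m128i R = _mm_madd_epi16(A, B); // pmaddwd
  alignas(16) int Lanes[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(Lanes), R);
  assert(Lanes[0] == 1 * 10 + 2 * 20);    // 50
  assert(Lanes[3] == 7 * 70 + (-8) * 80); // -150
  return 0;
}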
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + + // Madd vector size is half of the original vector size + SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); + // Fill the rest of the output with 0 + SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); + return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -33656,6 +34742,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (Flags->hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; + if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) + return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); @@ -33667,7 +34755,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -33700,36 +34788,44 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + SDLoc DL(N); unsigned Opcode = N->getOpcode(); MVT VT = N->getSimpleValueType(0); MVT SVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); - unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); + unsigned InputBits = OpEltSizeInBits * NumElts; // Perform any constant folding. // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - unsigned NumDstElts = VT.getVectorNumElements(); - SmallBitVector Undefs(NumDstElts, false); - SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0)); - for (unsigned i = 0; i != NumDstElts; ++i) { - SDValue OpElt = Op.getOperand(i); - if (OpElt.getOpcode() == ISD::UNDEF) { - Undefs[i] = true; + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { + APInt Undefs(NumElts, 0); + SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0)); + bool IsZEXT = + (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + Undefs.setBit(i); continue; } - APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); - Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits()) - : Cst.sextOrTrunc(SVT.getSizeInBits()); + Vals[i] = IsZEXT ? 
EltBits[i].zextOrTrunc(EltSizeInBits) + : EltBits[i].sextOrTrunc(EltSizeInBits); } return getConstVector(Vals, Undefs, VT, DAG, DL); } @@ -33829,7 +34925,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); if (N->getOpcode() == X86ISD::PCMPGT) return getZeroVector(VT, Subtarget, DAG, DL); } @@ -33837,6 +34933,98 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + SDValue Vec = N->getOperand(0); + SDValue SubVec = N->getOperand(1); + SDValue Idx = N->getOperand(2); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = N->getSimpleValueType(0); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // If this is an insert of an extract, combine to a shuffle. Don't do this + // if the insert or extract can be represented with a subvector operation. + if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && + SubVec.getOperand(0).getSimpleValueType() == OpVT && + (IdxVal != 0 || !Vec.isUndef())) { + int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue(); + if (ExtIdxVal != 0) { + int VecNumElts = OpVT.getVectorNumElements(); + int SubVecNumElts = SubVecVT.getVectorNumElements(); + SmallVector<int, 64> Mask(VecNumElts); + // First create an identity shuffle mask. + for (int i = 0; i != VecNumElts; ++i) + Mask[i] = i; + // Now insert the extracted portion. + for (int i = 0; i != SubVecNumElts; ++i) + Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; + + return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); + } + } + + // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte + // load: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr + 16), Elts/2) + // --> load32 addr + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr + 32), Elts/2) + // --> load64 addr + // or a 16-byte or 32-byte broadcast: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr), Elts/2) + // --> X86SubVBroadcast(load16 addr) + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr), Elts/2) + // --> X86SubVBroadcast(load32 addr) + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through bitcasts to get to the load. + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = {SubVec2, SubVec}; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } + } + // If lower/upper loads are the same and the only users of the load, then + // lower to a VBROADCASTF128/VBROADCASTI128/etc. 
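// Aside (illustrative sketch, not part of this patch): the pattern recognised
// here -- the same 128-bit load inserted into both halves of a 256-bit vector
// -- is exactly what VBROADCASTF128 produces, shown with AVX intrinsics:
#include <immintrin.h>
#include <cassert>
#include <cstring>

int main() {
  alignas(16) const float Src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128 Lo = _mm_load_ps(Src);
  __m256 ViaInsert = _mm256_insertf128_ps(_mm256_castps128_ps256(Lo), Lo, 1);
  __m256 ViaBcast = _mm256_broadcast_ps(reinterpret_cast<const __m128 *>(Src));
  assert(std::memcmp(&ViaInsert, &ViaBcast, sizeof(__m256)) == 0);
  return 0;
}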
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { + if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && + SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); + } + } + // If this is subv_broadcast insert into both halves, use a larger + // subv_broadcast. + if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, + SubVec.getOperand(0)); + } + } + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -33845,6 +35033,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case X86ISD::PEXTRW: + case X86ISD::PEXTRB: + return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget); + case ISD::INSERT_SUBVECTOR: + return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); @@ -33870,6 +35063,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: @@ -33884,12 +35078,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); - case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); - case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); + case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); + case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::VSHLI: - case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget); + case X86ISD::VSRAI: + case X86ISD::VSRLI: + return combineVectorShiftImm(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VSEXT: case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); + case X86ISD::PINSRB: + case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::PALIGNR: @@ -34717,10 +35917,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return Res; } - // 'A' means EAX + EDX. + // 'A' means [ER]AX + [ER]DX. 
if (Constraint == "A") { - Res.first = X86::EAX; - Res.second = &X86::GR32_ADRegClass; + if (Subtarget.is64Bit()) { + Res.first = X86::RAX; + Res.second = &X86::GR64_ADRegClass; + } else if (Subtarget.is32Bit()) { + Res.first = X86::EAX; + Res.second = &X86::GR32_ADRegClass; + } else if (Subtarget.is16Bit()) { + Res.first = X86::AX; + Res.second = &X86::GR16_ADRegClass; + } else { + llvm_unreachable("Expecting 64, 32 or 16 bit subtarget"); + } return Res; } return Res; @@ -34812,7 +36022,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. @@ -34820,8 +36030,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize); + bool OptSize = + Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } |