Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 1207
1 file changed, 1035 insertions(+), 172 deletions(-)
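
Two of the denser additions in the diff below are the custom LowerSHL paths for vector shifts by non-constant amounts. As a reading aid, here is a scalar C++ model of the v4i32 path; the helper name and scalar framing are mine, not part of the patch, but the arithmetic mirrors the DAG nodes it emits: the shift amount is planted into a float's exponent field so the bit pattern reads back as 2^amt, and the shift then becomes a multiply (pmulld).

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of the v4i32 LowerSHL path in the diff (helper name is hypothetical).
// (amt << 23) + 0x3f800000 builds the IEEE-754 bit pattern of 2.0^amt, so a
// bitcast to float followed by a truncating conversion back to integer
// recovers the power of two; x << amt is then x * 2^amt.
static uint32_t pow2_via_float_bits(uint32_t amt) {
  uint32_t bits = (amt << 23) + 0x3f800000u; // pslld $23 plus the exponent bias
  float f;
  std::memcpy(&f, &bits, sizeof f);          // BIT_CONVERT to v4f32, per lane
  // FP_TO_SINT (cvttps2dq) in the DAG; modelled with an unsigned cast so that
  // amt == 31 stays well-defined in C++.
  return static_cast<uint32_t>(f);
}

int main() {
  for (uint32_t amt = 0; amt < 32; ++amt)
    assert(0x12345u * pow2_via_float_bits(amt) == (0x12345u << amt));
}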
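The v16i8 path cannot lean on a vector multiply, so it instead decomposes the shift into three conditional rounds selected with pblendvb off the sign bit of the (pre-shifted) amount. A per-byte sketch, again with a made-up name and only meaningful for shift amounts 0-7:

#include <cassert>
#include <cstdint>

// Per-byte model of the v16i8 LowerSHL path in the diff (function name is made up).
// amt is first shifted left by 5 so its three bits visit the byte's sign-bit
// position one round at a time; each pblendvb keys a conditional shift of 4,
// 2 and then 1 off that sign bit.
static uint8_t shl_byte_model(uint8_t r, uint8_t amt) {
  uint8_t a = uint8_t(amt << 5);             // a = a << 5
  if (a & 0x80) r = uint8_t((r & 15) << 4);  // blend in psllw-by-4 of (r & 15)
  a = uint8_t(a + a);                        // a += a
  if (a & 0x80) r = uint8_t((r & 63) << 2);  // blend in psllw-by-2 of (r & 63)
  a = uint8_t(a + a);                        // a += a
  if (a & 0x80) r = uint8_t(r + r);          // final blend of r + r
  return r;
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt)
    for (unsigned r = 0; r < 256; ++r)
      assert(shl_byte_model(uint8_t(r), uint8_t(amt)) == uint8_t(r << amt));
}
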
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b3c48862898ff..95dbb61766874 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16,6 +16,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86ISelLowering.h" +#include "X86ShuffleDecode.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/CallingConv.h" @@ -343,8 +344,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); - if (!Subtarget->hasSSE2()) - setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); + // We may not have a libcall for MEMBARRIER so we should lower this. + setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); + // On X86 and X86-64, atomic operations are lowered to locked instructions. // Locked instructions, in turn, have implicit fence semantics (all memory // operations are flushed before issuing the locked instruction, and they @@ -837,6 +839,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); + // Can turn SHL into an integer multiply. + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); + // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional @@ -866,6 +872,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); + addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v8i32, Legal); @@ -877,7 +884,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); - //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); @@ -1189,6 +1196,50 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; } +std::pair<const TargetRegisterClass*, uint8_t> +X86TargetLowering::findRepresentativeClass(EVT VT) const{ + const TargetRegisterClass *RRC = 0; + uint8_t Cost = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: + RRC = (Subtarget->is64Bit() + ? 
X86::GR64RegisterClass : X86::GR32RegisterClass); + break; + case MVT::v8i8: case MVT::v4i16: + case MVT::v2i32: case MVT::v1i64: + RRC = X86::VR64RegisterClass; + break; + case MVT::f32: case MVT::f64: + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: + case MVT::v4f64: + RRC = X86::VR128RegisterClass; + break; + } + return std::make_pair(RRC, Cost); +} + +unsigned +X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; + switch (RC->getID()) { + default: + return 0; + case X86::GR32RegClassID: + return 4 - FPDiff; + case X86::GR64RegClassID: + return 8 - FPDiff; + case X86::VR128RegClassID: + return Subtarget->is64Bit() ? 10 : 4; + case X86::VR64RegClassID: + return 4; + } +} + bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) @@ -1259,6 +1310,19 @@ X86TargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; + EVT ValVT = ValToCopy.getValueType(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((ValVT == MVT::f32 || ValVT == MVT::f64) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. @@ -1276,14 +1340,20 @@ X86TargetLowering::LowerReturn(SDValue Chain, // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { - EVT ValVT = ValToCopy.getValueType(); if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); - if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) - ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. 
+ if (!Subtarget->hasSSE2()) + ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); + } } } - + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); } @@ -1570,6 +1640,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = X86::FR32RegisterClass; else if (RegVT == MVT::f64) RC = X86::FR64RegisterClass; + else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) + RC = X86::VR256RegisterClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) RC = X86::VR128RegisterClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) @@ -1937,6 +2009,19 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (isVarArg && Subtarget->isTargetWin64()) { + // Win64 ABI requires argument XMM reg to be copied to the corresponding + // shadow reg if callee is a varargs function. + unsigned ShadowReg = 0; + switch (VA.getLocReg()) { + case X86::XMM0: ShadowReg = X86::RCX; break; + case X86::XMM1: ShadowReg = X86::RDX; break; + case X86::XMM2: ShadowReg = X86::R8; break; + case X86::XMM3: ShadowReg = X86::R9; break; + } + if (ShadowReg) + RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); + } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) @@ -1990,7 +2075,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } - if (Is64Bit && isVarArg) { + if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -1999,7 +2084,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. - // FIXME: Verify this on Win64 // Count the number of XMM registers allocated. static const unsigned XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, @@ -2165,8 +2249,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (!isTailCall && Subtarget->isPICStyleGOT()) Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); - // Add an implicit use of AL for x86 vararg functions. - if (Is64Bit && isVarArg) + // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. + if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); if (InFlag.getNode()) @@ -2356,8 +2440,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (RegInfo->needsStackRealignment(MF)) return false; - // Do not sibcall optimize vararg calls unless the call site is not passing any - // arguments. + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. 
if (isVarArg && !Outs.empty()) return false; @@ -2493,6 +2577,112 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { // Other Lowering Hooks //===----------------------------------------------------------------------===// +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFPD: + case X86ISD::SHUFPS: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + return true; + } + return false; +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + return DAG.getNode(Opc, dl, VT, V1); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::SHUFPD: + case X86ISD::SHUFPS: + return DAG.getNode(Opc, dl, VT, V1, V2, + DAG.getConstant(TargetMask, MVT::i8)); + } + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + return DAG.getNode(Opc, dl, VT, V1, V2); + } + return SDValue(); +} SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3347,18 +3537,27 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest - // type. This ensures they get CSE'd. + // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // to their dest type. 
This ensures they get CSE'd. SDValue Vec; if (VT.getSizeInBits() == 64) { // MMX SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - } else if (HasSSE2) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } else { // SSE1 + } else if (VT.getSizeInBits() == 128) { + if (HasSSE2) { // SSE2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.getSizeInBits() == 256) { // AVX + // 256-bit logic and arithmetic instructions in AVX are + // all floating-point, no support for integer ops. Default + // to emitting fp zeroed vectors then. SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3372,9 +3571,9 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { // type. This ensures they get CSE'd. SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 64) // MMX + if (VT.getSizeInBits() == 64) // MMX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else // SSE + else // SSE Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3439,9 +3638,8 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. -static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, - bool HasSSE2) { +/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. +static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { if (SV->getValueType(0).getVectorNumElements() <= 4) return SDValue(SV, 0); @@ -3488,68 +3686,253 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); } -/// getNumOfConsecutiveZeros - Return the number of elements in a result of -/// a shuffle that is zero. -static -unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, - bool Low, SelectionDAG &DAG) { - unsigned NumZeros = 0; - for (int i = 0; i < NumElems; ++i) { - unsigned Index = Low ? i : NumElems-i-1; - int Idx = SVOp->getMaskElt(Index); - if (Idx < 0) { - ++NumZeros; - continue; - } - SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); - if (Elt.getNode() && X86::isZeroNode(Elt)) - ++NumZeros; - else +/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. +SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, + unsigned Depth) { + if (Depth == 6) + return SDValue(); // Limit search depth. + + SDValue V = SDValue(N, 0); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 
+ if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + Index = SV->getMaskElt(Index); + + if (Index < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + int NumElems = VT.getVectorNumElements(); + SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + } + + // Recurse into target specific vector shuffles to find scalars. + if (isTargetShuffle(Opcode)) { + int NumElems = VT.getVectorNumElements(); + SmallVector<unsigned, 16> ShuffleMask; + SDValue ImmN; + + switch(Opcode) { + case X86ISD::SHUFPS: + case X86ISD::SHUFPD: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPSMask(NumElems, + cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + DecodePUNPCKHMask(NumElems, ShuffleMask); break; + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + DecodeUNPCKHPMask(NumElems, ShuffleMask); + break; + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + DecodePUNPCKLMask(NumElems, ShuffleMask); + break; + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + DecodeUNPCKLPMask(NumElems, ShuffleMask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, ShuffleMask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, ShuffleMask); + break; + case X86ISD::PSHUFD: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(NumElems, + cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. The other + // elements come from the other positions of the first source vector. + unsigned OpNum = (Index == 0) ? 1 : 0; + return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, + Depth+1); + } + default: + assert("not implemented for target shuffle node"); + return SDValue(); + } + + Index = ShuffleMask[Index]; + if (Index < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, + Depth+1); } - return NumZeros; -} -/// isVectorShift - Returns true if the shuffle can be implemented as a -/// logical left or right shift of a vector. -/// FIXME: split into pslldqi, psrldqi, palignr variants. 
-static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + // Actual nodes that may contain scalar elements + if (Opcode == ISD::BIT_CONVERT) { + V = V.getOperand(0); + EVT SrcVT = V.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); - isLeft = true; - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); - if (!NumZeros) { - isLeft = false; - NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); - if (!NumZeros) - return false; + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) + return SDValue(); + } + + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? V.getOperand(0) + : DAG.getUNDEF(VT.getVectorElementType()); + + if (V.getOpcode() == ISD::BUILD_VECTOR) + return V.getOperand(Index); + + return SDValue(); +} + +/// getNumOfConsecutiveZeros - Return the number of elements of a vector +/// shuffle operation which come from a consecutively from a zero. The +/// search can start in two diferent directions, from left or right. +static +unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, + bool ZerosFromLeft, SelectionDAG &DAG) { + int i = 0; + + while (i < NumElems) { + unsigned Index = ZerosFromLeft ? i : NumElems-i-1; + SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); + if (!(Elt.getNode() && + (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) + break; + ++i; } + + return i; +} + +/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to +/// MaskE correspond consecutively to elements from one of the vector operands, +/// starting from its index OpIdx. Also tell OpNum which source vector operand. +static +bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, + int OpIdx, int NumElems, unsigned &OpNum) { bool SeenV1 = false; bool SeenV2 = false; - for (unsigned i = NumZeros; i < NumElems; ++i) { - unsigned Val = isLeft ? (i - NumZeros) : i; - int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); - if (Idx_ < 0) + + for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { + int Idx = SVOp->getMaskElt(i); + // Ignore undef indicies + if (Idx < 0) continue; - unsigned Idx = (unsigned) Idx_; + if (Idx < NumElems) SeenV1 = true; - else { - Idx -= NumElems; + else SeenV2 = true; - } - if (Idx != Val) + + // Only accept consecutive elements from the same vector + if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) return false; } - if (SeenV1 && SeenV2) + + OpNum = SeenV1 ? 0 : 1; + return true; +} + +/// isVectorShiftRight - Returns true if the shuffle can be implemented as a +/// logical left shift of a vector. +static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, + false /* check zeros from right */, DAG); + unsigned OpSrc; + + if (!NumZeros) + return false; + + // Considering the elements in the mask that are not consecutive zeros, + // check if they consecutively come from only one of the source vectors. 
+ // + // V1 = {X, A, B, C} 0 + // \ \ \ / + // vector_shuffle V1, V2 <1, 2, 3, X> + // + if (!isShuffleMaskConsecutive(SVOp, + 0, // Mask Start Index + NumElems-NumZeros-1, // Mask End Index + NumZeros, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? + return false; + + isLeft = false; + ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); + return true; +} + +/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a +/// logical left shift of a vector. +static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, + true /* check zeros from left */, DAG); + unsigned OpSrc; + + if (!NumZeros) + return false; + + // Considering the elements in the mask that are not consecutive zeros, + // check if they consecutively come from only one of the source vectors. + // + // 0 { A, B, X, X } = V2 + // / \ / / + // vector_shuffle V1, V2 <X, X, 4, 5> + // + if (!isShuffleMaskConsecutive(SVOp, + NumZeros, // Mask Start Index + NumElems-1, // Mask End Index + 0, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? return false; - ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); + isLeft = true; ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); return true; } +/// isVectorShift - Returns true if the shuffle can be implemented as a +/// logical left or right shift of a vector. +static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || + isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) + return true; + + return false; +} /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// @@ -3779,9 +4162,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - // All zero's are handled with pxor, all one's are handled with pcmpeqd. - if (ISD::isBuildVectorAllZeros(Op.getNode()) - || ISD::isBuildVectorAllOnes(Op.getNode())) { + // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. + // All one's are handled with pcmpeqd. In AVX, zero's are handled with + // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd + // is present, so AllOnes is ignored. + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + (Op.getValueType().getSizeInBits() != 256 && + ISD::isBuildVectorAllOnes(Op.getNode()))) { // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are // eliminated on x86-32 hosts. @@ -3819,10 +4206,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - if (NumNonZero == 0) { - // All undef vector. Return an UNDEF. All zero vectors were handled above. + // All undef vector. Return an UNDEF. All zero vectors were handled above. + if (NumNonZero == 0) return DAG.getUNDEF(VT); - } // Special case for single non-zero, non-undef, element. 
if (NumNonZero == 1) { @@ -3960,7 +4346,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + *this); if (V.getNode()) return V; } @@ -4014,28 +4400,51 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; - // For SSE 4.1, use inserts into undef. + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { - V[0] = DAG.getUNDEF(VT); - for (unsigned i = 0; i < NumElems; ++i) - if (Op.getOperand(i).getOpcode() != ISD::UNDEF) - V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i)); - return V[0]; + } + return Result; } - // Otherwise, expand into a number of unpckl* - // e.g. for v4f32 + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> - for (unsigned i = 0; i < NumElems; ++i) - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); - NumElems >>= 1; - while (NumElems != 0) { - for (unsigned i = 0; i < NumElems; ++i) - V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); - NumElems >>= 1; + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; } return V[0]; } @@ -4074,10 +4483,10 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -static -SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, - const X86TargetLowering &TLI) { +SDValue +X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, + SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -4128,7 +4537,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, // quads, disable the next transformation since it does not help SSSE3. 
bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (TLI.getSubtarget()->hasSSSE3()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads.find_first(); BestHiQuad = InputQuads.find_next(BestLoQuad); @@ -4187,15 +4596,21 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, // If we've eliminated the use of V2, and the new mask is a pshuflw or // pshufhw, that's as cheap as it gets. Return the new shuffle. if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { - return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, + unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; + unsigned TargetMask = 0; + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); + TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): + X86::getShufflePSHUFLWImmediate(NewV.getNode()); + V1 = NewV.getOperand(0); + return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } } // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (TLI.getSubtarget()->hasSSSE3()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -4262,6 +4677,12 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFLWImmediate(NewV.getNode()), + DAG); } // If BestHi >= 0, generate a pshufhw to put the high elements in order, @@ -4284,6 +4705,12 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFHWImmediate(NewV.getNode()), + DAG); } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -4473,7 +4900,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SDValue V2 = SVOp->getOperand(1); unsigned NumElems = VT.getVectorNumElements(); unsigned NewWidth = (NumElems == 4) ? 2 : 4; - EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); + EVT MaskVT = (NewWidth == 4) ? 
MVT::v4i16 : MVT::v2i32; EVT NewVT = MaskVT; switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Unexpected!"); @@ -4697,6 +5124,129 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } +static bool MayFoldVectorLoad(SDValue V) { + if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) + V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) + V = V.getOperand(0); + if (MayFoldLoad(V)) + return true; + return false; +} + +static +SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, + bool HasSSE2) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + + assert(VT != MVT::v2i64 && "unsupported shuffle type"); + + if (HasSSE2 && VT == MVT::v2f64) + return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); + + // v4f32 or v4i32 + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); +} + +static +SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + + assert((VT == MVT::v4i32 || VT == MVT::v4f32) && + "unsupported shuffle type"); + + if (V2.getOpcode() == ISD::UNDEF) + V2 = V1; + + // v4i32 or v4f32 + return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); +} + +static +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second + // operand of these instructions is only memory, so check if there's a + // potencial load folding here, otherwise use SHUFPS or MOVSD to match the + // same masks. + bool CanFoldLoad = false; + + // Trivial case, when V2 comes from a load. + if (MayFoldVectorLoad(V2)) + CanFoldLoad = true; + + // When V1 is a load, it can be folded later into a store in isel, example: + // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) + // turns into: + // (MOVLPSmr addr:$src1, VR128:$src2) + // So, recognize this potential and also use MOVLPS or MOVLPD + if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + CanFoldLoad = true; + + if (CanFoldLoad) { + if (HasSSE2 && NumElems == 2) + return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); + + if (NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + } + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + // movl and movlp will both match v2i64, but v2i64 is never matched by + // movl earlier because we make it strict to avoid messing with the movlp load + // folding logic (see the code above getMOVLP call). Match it here then, + // this is horrible, but will stay like this until we move all shuffle + // matching to x86 specific nodes. Note that for the 1st condition all + // types are matched with movsd. + if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + else if (HasSSE2) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + + + assert(VT != MVT::v4i32 && "unsupported shuffle type"); + + // Invert the operand order and use SHUFPS to match it. 
+ return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, + X86::getShuffleSHUFImmediate(SVOp), DAG); +} + +static inline unsigned getUNPCKLOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: return X86ISD::PUNPCKLDQ; + case MVT::v2i64: return X86ISD::PUNPCKLQDQ; + case MVT::v4f32: return X86ISD::UNPCKLPS; + case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v16i8: return X86ISD::PUNPCKLBW; + case MVT::v8i16: return X86ISD::PUNPCKLWD; + default: + llvm_unreachable("Unknow type for unpckl"); + } + return 0; +} + +static inline unsigned getUNPCKHOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: return X86ISD::PUNPCKHDQ; + case MVT::v2i64: return X86ISD::PUNPCKHQDQ; + case MVT::v4f32: return X86ISD::UNPCKHPS; + case MVT::v2f64: return X86ISD::UNPCKHPD; + case MVT::v16i8: return X86ISD::PUNPCKHBW; + case MVT::v8i16: return X86ISD::PUNPCKHWD; + default: + llvm_unreachable("Unknow type for unpckh"); + } + return 0; +} + SDValue X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); @@ -4710,6 +5260,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; + bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); + bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); + MachineFunction &MF = DAG.getMachineFunction(); + bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); if (isZeroShuffle(SVOp)) return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); @@ -4718,7 +5272,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (SVOp->isSplat()) { if (isMMX || NumElems < 4) return Op; - return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); + return PromoteSplat(SVOp, DAG); } // If the shuffle can be profitably rewritten as a narrower shuffle, then @@ -4746,8 +5300,35 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } } - if (X86::isPSHUFDMask(SVOp)) - return Op; + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and + // unpckh_undef). Only use pshufd if speed is more important than size. + if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + + if (X86::isPSHUFDMask(SVOp)) { + // The actual implementation will match the mask in the if above and then + // during isel it can match several different instructions, not only pshufd + // as its name says, sad but true, emulate the behavior for now... + if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); + + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, + TargetMask, DAG); + + if (VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, + TargetMask, DAG); + } // Check if this can be converted into a logical shift. 
bool isLeft = false; @@ -4768,17 +5349,32 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!isMMX) - return Op; + if (!isMMX && !X86::isMOVLPMask(SVOp)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + + if (VT == MVT::v4i32 || VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + } } // FIXME: fold these into legal mask. - if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || - X86::isMOVSLDUPMask(SVOp) || - X86::isMOVHLPSMask(SVOp) || - X86::isMOVLHPSMask(SVOp) || - X86::isMOVLPMask(SVOp))) - return Op; + if (!isMMX) { + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); + + if (X86::isMOVHLPSMask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); + + if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); + + if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); + + if (X86::isMOVLPMask(SVOp)) + return getMOVLP(Op, dl, DAG, HasSSE2); + } if (ShouldXformToMOVHLPS(SVOp) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) @@ -4818,11 +5414,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKL_v_undef_Mask(SVOp) || - X86::isUNPCKH_v_undef_Mask(SVOp) || - X86::isUNPCKLMask(SVOp) || - X86::isUNPCKHMask(SVOp)) - return Op; + if (X86::isUNPCKLMask(SVOp)) + return (isMMX) ? + Op : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + + if (X86::isUNPCKHMask(SVOp)) + return (isMMX) ? + Op : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -4844,11 +5442,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // FIXME: this seems wrong. SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); - if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || - X86::isUNPCKH_v_undef_Mask(NewSVOp) || - X86::isUNPCKLMask(NewSVOp) || - X86::isUNPCKHMask(NewSVOp)) - return NewOp; + + if (X86::isUNPCKLMask(NewSVOp)) + return (isMMX) ? + NewOp : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + + if (X86::isUNPCKHMask(NewSVOp)) + return (isMMX) ? + NewOp : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); } // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. @@ -4857,15 +5458,52 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) return CommuteVectorShuffle(SVOp, DAG); - // Check for legal shuffle and return? - SmallVector<int, 16> PermMask; - SVOp->getMask(PermMask); - if (isShuffleMaskLegal(PermMask, VT)) + // The checks below are all present in isShuffleMaskLegal, but they are + // inlined here right now to enable us to directly emit target specific + // nodes, and remove one by one until they don't return Op anymore. + SmallVector<int, 16> M; + SVOp->getMask(M); + + // Very little shuffling can be done for 64-bit vectors right now. + if (VT.getSizeInBits() == 64) + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ? 
Op : SDValue(); + + // FIXME: pshufb, blends, shifts. + if (VT.getVectorNumElements() == 2 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3())) return Op; + if (isPSHUFHWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, + X86::getShufflePSHUFHWImmediate(SVOp), + DAG); + + if (isPSHUFLWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, + X86::getShufflePSHUFLWImmediate(SVOp), + DAG); + + if (isSHUFPMask(M, VT)) { + unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (VT == MVT::v4f32 || VT == MVT::v4i32) + return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, + TargetMask, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, + TargetMask, DAG); + } + + if (X86::isUNPCKL_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + if (X86::isUNPCKH_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); if (NewOp.getNode()) return NewOp; } @@ -6922,24 +7560,58 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // ptest intrinsics. The intrinsic these come from are designed to return - // an integer value, not just an instruction so lower it to the ptest - // pattern and a setcc for the result. + // ptest and testp intrinsics. The intrinsic these come from are designed to + // return an integer value, not just an instruction so lower it to the ptest + // or testp pattern and a setcc for the result. 
case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: - case Intrinsic::x86_sse41_ptestnzc:{ + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; unsigned X86CC = 0; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; @@ -6947,7 +7619,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue CC = DAG.getConstant(X86CC, MVT::i8); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); @@ -7110,12 +7783,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Handler = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, - getPointerTy()); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, + Subtarget->is64Bit() ? X86::RBP : X86::EBP, + getPointerTy()); unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); - SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(-TD->getPointerSize())); + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, + DAG.getIntPtrConstant(TD->getPointerSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); @@ -7218,7 +7892,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; if (InRegCount > 2) { - report_fatal_error("Nest register in use - reduce number of inreg parameters!"); + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); } } break; @@ -7439,6 +8114,86 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } +SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + + LLVMContext *Context = DAG.getContext(); + + assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + + if (VT == MVT::v4i32) { + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + + ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); + + std::vector<Constant*> CV(4, CI); + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8) { + // a = a << 5; + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + Op.getOperand(1), DAG.getConstant(5, MVT::i32)); + + ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); + ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); + + std::vector<Constant*> CVM1(16, CM1); + std::vector<Constant*> CVM2(16, CM2); + Constant *C = ConstantVector::get(CVM1); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + // r = pblendv(r, psllw(r & (char16)15, 4), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(4, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + C = ConstantVector::get(CVM2); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, false, false, 16); + + // r = pblendv(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(2, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + 
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + // return pblendv(r, r+r, a); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); + return R; + } + return SDValue(); +} SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus @@ -7508,6 +8263,50 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return Sum; } +SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ + DebugLoc dl = Op.getDebugLoc(); + + if (!Subtarget->hasSSE2()) { + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, + Subtarget->is64Bit() ? MVT::i64 : MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = + DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, + array_lengthof(Ops)); + return SDValue(Res, 0); + } + + unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); + if (!isDev) + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); + + unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + + // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; + if (!Op1 && !Op2 && !Op3 && Op4) + return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; + if (Op1 && !Op2 && !Op3 && !Op4) + return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), + // (MFENCE)>; + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); +} + SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { EVT T = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -7597,6 +8396,7 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -7640,6 +8440,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL_V2I64(Op, DAG); + case ISD::SHL: return LowerSHL(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -7852,6 +8653,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::AND: return "X86ISD::AND"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case 
X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; + case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; + case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; + case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; + case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; + case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; + case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; + case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; + case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; + case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; + case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; + case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; + case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; + case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; + case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; } @@ -7863,6 +8698,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) @@ -7882,7 +8718,8 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; // If lower 4G is not available, then we must use rip-relative addressing. - if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } @@ -8368,19 +9205,31 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 -// all of this code can be replaced with that in the .td file. +// or XMM0_V32I8 in AVX all of this code can be replaced with that +// in the .td file. MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { + assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + "Target must have SSE4.2 or AVX features enabled"); + DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); unsigned Opc; - if (memArg) - Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; - else - Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; + + if (!Subtarget->hasAVX()) { + if (memArg) + Opc = numArgs == 3 ? 
@@ -8368,19 +9205,31 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
 }

 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
-// all of this code can be replaced with that in the .td file.
+// or XMM0_V32I8 in AVX all of this code can be replaced with that
+// in the .td file.
 MachineBasicBlock *
 X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                             unsigned numArgs, bool memArg) const {
+  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
+         "Target must have SSE4.2 or AVX features enabled");
+
   DebugLoc dl = MI->getDebugLoc();
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

   unsigned Opc;
-  if (memArg)
-    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
-  else
-    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
+
+  if (!Subtarget->hasAVX()) {
+    if (memArg)
+      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
+    else
+      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
+  } else {
+    if (memArg)
+      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
+    else
+      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+  }

   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
@@ -8562,7 +9411,8 @@ X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
     .addReg(X86::EAX, RegState::Implicit)
     .addReg(X86::ESP, RegState::Implicit)
     .addReg(X86::EAX, RegState::Define | RegState::Implicit)
-    .addReg(X86::ESP, RegState::Define | RegState::Implicit);
+    .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+    .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);

   MI->eraseFromParent();   // The pseudo instruction is gone now.
   return BB;
@@ -8579,6 +9429,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
     = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction *F = BB->getParent();
+  bool IsWin64 = Subtarget->isTargetWin64();

   assert(MI->getOperand(3).isGlobal() && "This should be a global");
@@ -8590,7 +9441,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
       .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                         MI->getOperand(3).getTargetFlags())
       .addReg(0);
-    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? X86::WINCALL64m : X86::CALL64m));
     addDirectMem(MIB, X86::RDI);
   } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
@@ -8727,12 +9578,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   }
     // String/text processing lowering.
   case X86::PCMPISTRM128REG:
+  case X86::VPCMPISTRM128REG:
     return EmitPCMP(MI, BB, 3, false /* in-mem */);
   case X86::PCMPISTRM128MEM:
+  case X86::VPCMPISTRM128MEM:
     return EmitPCMP(MI, BB, 3, true /* in-mem */);
   case X86::PCMPESTRM128REG:
+  case X86::VPCMPESTRM128REG:
     return EmitPCMP(MI, BB, 5, false /* in mem */);
   case X86::PCMPESTRM128MEM:
+  case X86::VPCMPESTRM128MEM:
     return EmitPCMP(MI, BB, 5, true /* in mem */);

   // Atomic Lowering.
@@ -8966,21 +9821,20 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      const TargetLowering &TLI) {
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
-  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

   if (VT.getSizeInBits() != 128)
     return SDValue();

   SmallVector<SDValue, 16> Elts;
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
-    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
-
+    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }

-/// PerformShuffleCombine - Detect vector gather/scatter index generation
-/// and convert it from being a bunch of shuffles and extracts to a simple
-/// store and scalar loads to extract the elements.
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                 const TargetLowering &TLI) {
   SDValue InputVector = N->getOperand(0);
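PerformShuffleCombine, as reworked above, traces each lane of a 128-bit shuffle back to its scalar source with getShuffleScalarElt and hands the list to EltsFromConsecutiveLoads, so a value whose lanes are really adjacent words in memory can collapse into one wide load. A conceptual illustration of the kind of pattern this targets, written with SSE intrinsics (illustration only, not from this patch):

    #include <xmmintrin.h>

    // Four adjacent scalars gathered lane by lane; once every lane is traced
    // back to a consecutive load, the combine can use a single 16-byte vector
    // load instead of the element-wise construction.
    __m128 gather4(const float *p) {
      return _mm_set_ps(p[3], p[2], p[1], p[0]);   // lanes 3..0
    }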
@@ -9030,8 +9884,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,

   // Store the value to a temporary stack slot.
   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
-                            false, false, 0);
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
+                            0, false, false, 0);

   // Replace each use (extract) with a load of the appropriate element.
@@ -9045,11 +9899,12 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+                                     OffsetVal, StackPtr);

     // Load the scalar.
-    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
-                                     NULL, 0, false, false, 0);
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+                                     ScalarAddr, NULL, 0, false, false, 0);

     // Replace the exact with the load.
     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
@@ -9087,8 +9942,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       // Converting this to a min would handle NaNs incorrectly, and swapping
       // the operands would cause it to handle comparisons between positive
       // and negative zero incorrectly.
-      if (!FiniteOnlyFPMath() &&
-          (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
         if (!UnsafeFPMath &&
             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
           break;
@@ -9126,8 +9980,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       // Converting this to a max would handle NaNs incorrectly, and swapping
       // the operands would cause it to handle comparisons between positive
       // and negative zero incorrectly.
-      if (!FiniteOnlyFPMath() &&
-          (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
         if (!UnsafeFPMath &&
             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
           break;
@@ -9156,8 +10009,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       // cause it to handle NaNs incorrectly.
       if (!UnsafeFPMath &&
           !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
-        if (!FiniteOnlyFPMath() &&
-            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
           break;
         std::swap(LHS, RHS);
       }
@@ -9182,8 +10034,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,

     case ISD::SETULT:
       // Converting this to a max would handle NaNs incorrectly.
-      if (!FiniteOnlyFPMath() &&
-          (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
         break;
       Opcode = X86ISD::FMAX;
       break;
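The FiniteOnlyFPMath() guard removed in these hunks used to let the select-to-min/max combine assume NaNs never occur; the transform now relies only on what DAG.isKnownNeverNaN can prove about the operands. The caution matters because minss/maxss return their second operand whenever an input is NaN, so they do not match an ordinary compare-and-select once NaNs are possible. A small demonstration, assuming SSE intrinsics (illustration only, not from this patch):

    #include <cmath>
    #include <cstdio>
    #include <xmmintrin.h>

    int main() {
      float nan = std::nanf(""), one = 1.0f;

      // Plain compare-and-select: (nan < 1.0f) is false, so the result is 1.0f.
      float sel = nan < one ? nan : one;

      // minss returns its second operand when either input is NaN, so the
      // result depends on operand order; the combine must prove the inputs
      // NaN-free (or swap them) before forming X86ISD::FMIN/FMAX.
      float m1 = _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(nan), _mm_set_ss(one)));
      float m2 = _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(one), _mm_set_ss(nan)));

      std::printf("select=%f min(nan,1)=%f min(1,nan)=%f\n", sel, m1, m2);
      return 0;
    }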
@@ -9193,8 +10044,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       // Converting this to a min would handle comparisons between positive
       // and negative zero incorrectly, and swapping the operands would
       // cause it to handle NaNs incorrectly.
       if (!UnsafeFPMath &&
           !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
-        if (!FiniteOnlyFPMath() &&
-            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
           break;
         std::swap(LHS, RHS);
       }
@@ -9905,7 +10755,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default: break;
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
   case ISD::EXTRACT_VECTOR_ELT:
                         return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
@@ -9922,6 +10771,28 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
+  case X86ISD::SHUFPS:      // Handle all target specific shuffles
+  case X86ISD::SHUFPD:
+  case X86ISD::PUNPCKHBW:
+  case X86ISD::PUNPCKHWD:
+  case X86ISD::PUNPCKHDQ:
+  case X86ISD::PUNPCKHQDQ:
+  case X86ISD::UNPCKHPS:
+  case X86ISD::UNPCKHPD:
+  case X86ISD::PUNPCKLBW:
+  case X86ISD::PUNPCKLWD:
+  case X86ISD::PUNPCKLDQ:
+  case X86ISD::PUNPCKLQDQ:
+  case X86ISD::UNPCKLPS:
+  case X86ISD::UNPCKLPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
   }

   return SDValue();
@@ -9956,14 +10827,6 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }

-static bool MayFoldLoad(SDValue Op) {
-  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
-}
-
-static bool MayFoldIntoStore(SDValue Op) {
-  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
-}
-
 /// IsDesirableToPromoteOp - This method query the target whether it is
 /// beneficial for dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
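The new case list in PerformDAGCombine above re-runs the shuffle combine after legalization, when a generic VECTOR_SHUFFLE has already been rewritten into target nodes such as PSHUFD, MOVLHPS or the (P)UNPCK* forms, so the consecutive-load folding still has a chance to fire late. A conceptual example of a shuffle-built value whose halves are adjacent in memory, again with SSE intrinsics (illustration only, not from this patch):

    #include <xmmintrin.h>

    // Low and high 64-bit halves loaded separately and glued together with a
    // MOVLHPS-style shuffle; when both halves come from adjacent addresses the
    // late combine can replace the pair with one 128-bit load.
    __m128 load_halves(const float *p) {
      __m128 lo = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)p);
      return _mm_loadh_pi(lo, (const __m64 *)(p + 2));
    }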