Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 193
1 file changed, 158 insertions, 35 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5303d7a406ad..831e9bdab0e1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,4 +1,4 @@
-
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -81,6 +81,12 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
              " of the loop header PC will be 0)."),
    cl::Hidden);
 
+static cl::opt<bool> MulConstantOptimization(
+    "mul-constant-optimization", cl::init(true),
+    cl::desc("Replace 'mul x, Const' with more effective instructions like "
+             "SHIFT, LEA, etc."),
+    cl::Hidden);
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -5810,7 +5816,8 @@ static bool setTargetShuffleZeroElements(SDValue N,
 // The decoded shuffle mask may contain a different number of elements to the
 // destination value type.
 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
-                               SmallVectorImpl<SDValue> &Ops) {
+                               SmallVectorImpl<SDValue> &Ops,
+                               SelectionDAG &DAG) {
   Mask.clear();
   Ops.clear();
 
@@ -5868,8 +5875,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
     }
 
-    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)) ||
-        NumElts <= SrcExtract.getConstantOperandVal(1))
+    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
       return false;
 
     SDValue SrcVec = SrcExtract.getOperand(0);
@@ -5877,8 +5883,12 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     unsigned NumSrcElts = SrcVT.getVectorNumElements();
     unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
 
+    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+    if (NumSrcElts <= SrcIdx)
+      return false;
+
     Ops.push_back(SrcVec);
-    Mask.push_back(SrcExtract.getConstantOperandVal(1));
+    Mask.push_back(SrcIdx);
     Mask.append(NumZeros, SM_SentinelZero);
     Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
     return true;
@@ -5915,6 +5925,19 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
     return true;
   }
+  case X86ISD::PACKSS: {
+    // If we know input saturation won't happen we can treat this
+    // as a truncation shuffle.
+    if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
+        DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
+      return false;
+
+    Ops.push_back(N.getOperand(0));
+    Ops.push_back(N.getOperand(1));
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(i * 2);
+    return true;
+  }
   case X86ISD::VSHLI:
   case X86ISD::VSRLI: {
     uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -5989,9 +6012,10 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
 /// Returns true if the target shuffle mask was decoded.
 static bool resolveTargetShuffleInputs(SDValue Op,
                                        SmallVectorImpl<SDValue> &Inputs,
-                                       SmallVectorImpl<int> &Mask) {
+                                       SmallVectorImpl<int> &Mask,
+                                       SelectionDAG &DAG) {
   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
-    if (!getFauxShuffleMask(Op, Mask, Inputs))
+    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
@@ -6391,6 +6415,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         const SDLoc &DL, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget,
                                         bool isAfterLegalize) {
   unsigned NumElems = Elts.size();
 
@@ -6495,6 +6520,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
       return SDValue();
 
+    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
+    // will lower to regular temporal loads and use the cache.
+    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+        VT.is256BitVector() && !Subtarget.hasInt256())
+      return SDValue();
+
    if (IsConsecutiveLoad)
      return CreateLoad(VT, LDBase);
 
@@ -7701,7 +7732,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     // See if we can use a vector load to get all of the elements.
     if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
       SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
-      if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+      if (SDValue LD =
+              EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
         return LD;
     }
 
@@ -7825,24 +7857,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     }
 
     // Next, we iteratively mix elements, e.g. for v4f32:
-    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
-    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
-    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
-    unsigned EltStride = NumElems >> 1;
-    while (EltStride != 0) {
-      for (unsigned i = 0; i < EltStride; ++i) {
-        // If Ops[i+EltStride] is undef and this is the first round of mixing,
-        // then it is safe to just drop this shuffle: V[i] is already in the
-        // right place, the one element (since it's the first round) being
-        // inserted as undef can be dropped. This isn't safe for successive
-        // rounds because they will permute elements within both vectors.
-        if (Ops[i+EltStride].isUndef() &&
-            EltStride == NumElems/2)
-          continue;
-
-        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
-      }
-      EltStride >>= 1;
+    //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+    //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+    //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
+    for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+      // Generate scaled UNPCKL shuffle mask.
+      SmallVector<int, 16> Mask;
+      for(unsigned i = 0; i != Scale; ++i)
+        Mask.push_back(i);
+      for (unsigned i = 0; i != Scale; ++i)
+        Mask.push_back(NumElems+i);
+      Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
+
+      for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+        Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
     }
     return Ops[0];
   }
@@ -17177,7 +17205,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
               Cond == ISD::SETGE || Cond == ISD::SETUGE;
   bool Invert = Cond == ISD::SETNE ||
                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
-  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond);
+
+  // If both operands are known non-negative, then an unsigned compare is the
+  // same as a signed compare and there's no need to flip signbits.
+  // TODO: We could check for more general simplifications here since we're
+  // computing known bits.
+  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
+                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
 
   // Special case: Use min/max operations for SETULE/SETUGE
   MVT VET = VT.getVectorElementType();
@@ -26741,6 +26775,17 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     return Tmp;
   }
 
+  case X86ISD::VSHLI: {
+    SDValue Src = Op.getOperand(0);
+    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+    if (ShiftVal.uge(VTBits))
+      return VTBits; // Shifted all bits out --> zero.
+    if (ShiftVal.uge(Tmp))
+      return 1; // Shifted all sign bits out --> unknown.
+    return Tmp - ShiftVal.getZExtValue();
+  }
+
   case X86ISD::VSRAI: {
     SDValue Src = Op.getOperand(0);
     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
@@ -27889,7 +27934,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
   // Extract target shuffle mask and resolve sentinels and inputs.
   SmallVector<int, 64> OpMask;
   SmallVector<SDValue, 2> OpInputs;
-  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
+  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
     return false;
 
   assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
@@ -28788,7 +28833,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     }
 
     if (Elts.size() == VT.getVectorNumElements())
-      if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+      if (SDValue LD =
+              EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
        return LD;
 
     // For AVX2, we sometimes want to combine
@@ -29430,7 +29476,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   // Resolve the target shuffle inputs and mask.
   SmallVector<int, 16> Mask;
   SmallVector<SDValue, 2> Ops;
-  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
+  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
     return SDValue();
 
   // Attempt to narrow/widen the shuffle mask to the correct size.
@@ -31017,6 +31063,77 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   }
 }
 
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+                                 EVT VT, SDLoc DL) {
+
+  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(Mult, DL, VT));
+    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+                         DAG.getConstant(Shift, DL, MVT::i8));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
+
+  auto combineMulMulAddOrSub = [&](bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(9, DL, VT));
+    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
+
+  switch (MulAmt) {
+  default:
+    break;
+  case 11:
+    // mul x, 11 => add ((shl (mul x, 5), 1), x)
+    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+  case 21:
+    // mul x, 21 => add ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+  case 22:
+    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+  case 19:
+    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+  case 13:
+    // mul x, 13 => add ((shl (mul x, 3), 2), x)
+    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+  case 23:
+    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
+    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+  case 14:
+    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
+  case 26:
+    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ false);
+  case 28:
+    // mul x, 28 => add ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ true);
+  case 29:
+    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulMulAddOrSub(/*isAdd*/ true));
+  case 30:
+    // mul x, 30 => sub (sub ((shl x, 5), x), x)
+    return DAG.getNode(
+        ISD::SUB, DL, VT,
+        DAG.getNode(ISD::SUB, DL, VT,
+                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                                DAG.getConstant(5, DL, MVT::i8)),
+                    N->getOperand(0)),
+        N->getOperand(0));
+  }
+  return SDValue();
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -31026,6 +31143,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 
+  if (!MulConstantOptimization)
+    return SDValue();
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -31081,7 +31200,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
     else
       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                            DAG.getConstant(MulAmt2, DL, VT));
-  }
+  } else if (!Subtarget.slowLEA())
+    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
 
   if (!NewMul) {
     assert(MulAmt != 0 &&
@@ -32381,15 +32501,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
-  // into two 16-byte operations.
+  // into two 16-byte operations. Also split non-temporal aligned loads on
+  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
   ISD::LoadExtType Ext = Ld->getExtensionType();
   bool Fast;
   unsigned AddressSpace = Ld->getAddressSpace();
   unsigned Alignment = Ld->getAlignment();
   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
       Ext == ISD::NON_EXTLOAD &&
-      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
-                             AddressSpace, Alignment, &Fast) && !Fast) {
+      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+                               AddressSpace, Alignment, &Fast) && !Fast))) {
     unsigned NumElems = RegVT.getVectorNumElements();
     if (NumElems < 2)
       return SDValue();
@@ -35097,7 +35219,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
       if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT,
                                   AS, Alignment, &Fast) && Fast) {
         SDValue Ops[] = {SubVec2, SubVec};
-        if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+        if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
+                                                  Subtarget, false))
          return Ld;
       }
     }
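
The new X86ISD::PACKSS case in getFauxShuffleMask rests on a saturation argument: when every input lane already has more sign bits than the destination element width, signed saturation can never fire, so the pack degenerates to keeping the low half of each lane, i.e. a truncating shuffle over the even sub-elements. The following is a standalone sketch in plain C++, not LLVM code; packss_lane is a hypothetical helper used only for illustration of a 16-bit to 8-bit pack.

#include <cassert>
#include <cstdint>

// Signed saturation of one 16-bit lane down to 8 bits, as PACKSSWB does.
static int8_t packss_lane(int16_t v) {
  if (v > 127)
    return 127;
  if (v < -128)
    return -128;
  return (int8_t)v;
}

int main() {
  // Lanes that already fit in int8 have more than 8 sign bits as 16-bit
  // values; for them the pack is identical to plain truncation.
  const int16_t fits[] = {-128, -1, 0, 1, 127};
  for (int16_t v : fits)
    assert(packss_lane(v) == (int8_t)v);

  // A lane that does not fit: saturation and truncation disagree, which is
  // why the combine bails out unless ComputeNumSignBits proves otherwise.
  assert(packss_lane(300) != (int8_t)300);
  return 0;
}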
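
The rewritten BUILD_VECTOR mixing loop builds one scaled UNPCKL-style mask per round instead of repeatedly calling getUnpackl. A standalone sketch (plain C++, not LLVM code) that prints the masks the loop generates for NumElems = 4, with SM_SentinelUndef shown as -1:

#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 4; // e.g. a v4f32 BUILD_VECTOR
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back((int)i);              // low elements of the first input
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back((int)(NumElems + i)); // low elements of the second input
    while (Mask.size() < NumElems)
      Mask.push_back(-1);                  // SM_SentinelUndef
    std::printf("Scale %u:", Scale);
    for (int m : Mask)
      std::printf(" %2d", m);
    std::printf("\n"); // prints { 0 4 -1 -1 } then { 0 1 4 5 }
  }
  return 0;
}

The first mask corresponds to the unpcklps step in the updated comment, the second to the final unpcklpd.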
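
The LowerVSETCC change skips the sign-bit flip when DAG.SignBitIsZero proves both operands non-negative: for non-negative values, signed and unsigned orderings agree, so the signed PCMPGT-based lowering is already correct. A minimal standalone check (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // With both sign bits clear, every unsigned predicate matches its signed
  // counterpart, so no sign-bit flip is needed before the vector compare.
  for (int32_t a = 0; a <= 300; ++a)
    for (int32_t b = 0; b <= 300; ++b) {
      assert((a < b) == ((uint32_t)a < (uint32_t)b));   // SETLT vs SETULT
      assert((a >= b) == ((uint32_t)a >= (uint32_t)b)); // SETGE vs SETUGE
    }
  return 0;
}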
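
The new X86ISD::VSHLI case in ComputeNumSignBitsForTargetNode applies the usual left-shift rule: a value with Tmp redundant sign bits keeps at least Tmp - S of them after shifting left by S, every bit is shifted out once S reaches the type width (the value is known zero, so VTBits is returned), and nothing is known once S reaches Tmp (return 1). A standalone sketch (plain C++, not LLVM code; numSignBits is a hypothetical stand-in for SelectionDAG::ComputeNumSignBits on an i32):

#include <cassert>
#include <cstdint>

// Count how many of the top bits are copies of the sign bit.
static unsigned numSignBits(int32_t v) {
  uint32_t u = (uint32_t)v;
  uint32_t sign = u >> 31;
  unsigned n = 0;
  for (int bit = 31; bit >= 0 && ((u >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

int main() {
  const int32_t vals[] = {0, -1, 3, -3, 100, -100, 1 << 20};
  for (int32_t v : vals) {
    unsigned tmp = numSignBits(v);
    for (unsigned s = 0; s < tmp && s < 32; ++s) {
      int32_t shifted = (int32_t)((uint32_t)v << s); // avoid signed-shift UB
      // Shifting left by s < tmp keeps at least tmp - s sign bits.
      assert(numSignBits(shifted) >= tmp - s);
    }
  }
  return 0;
}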
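
combineMulSpecial only fires when mul-constant-optimization is enabled and the target does not report slow LEA; each case rests on a small algebraic identity that maps onto LEA (which can compute x*3, x*5 and x*9 in one instruction), SHL and ADD/SUB. A standalone sketch (plain C++, not LLVM code) checking those identities:

#include <cassert>
#include <cstdint>

int main() {
  // The identities behind each combineMulSpecial case; x*3, x*5 and x*9 are
  // single LEAs on x86, so each line costs roughly LEA (+ SHL) + ADD/SUB
  // instead of an IMUL. They hold modulo 2^64, so wraparound is harmless.
  for (uint64_t x = 0; x < 1000; ++x) {
    assert(x * 11 == ((x * 5) << 1) + x);     // case 11
    assert(x * 21 == ((x * 5) << 2) + x);     // case 21
    assert(x * 22 == ((x * 5) << 2) + x + x); // case 22
    assert(x * 19 == ((x * 5) << 2) - x);     // case 19
    assert(x * 13 == ((x * 3) << 2) + x);     // case 13
    assert(x * 23 == ((x * 3) << 3) - x);     // case 23
    assert(x * 14 == ((x * 3) << 2) + x + x); // case 14
    assert(x * 26 == (x * 9) * 3 - x);        // case 26
    assert(x * 28 == (x * 9) * 3 + x);        // case 28
    assert(x * 29 == (x * 9) * 3 + x + x);    // case 29
    assert(x * 30 == (x << 5) - x - x);       // case 30
  }
  return 0;
}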