Diffstat (limited to 'lib/Target/PowerPC/PPCISelLowering.cpp')
 -rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 1087
 1 file changed, 833 insertions, 254 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 51ff8a5cf77e..39608cb74bee 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -251,12 +251,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Expand); } - if (Subtarget.hasP9Vector()) { - setOperationAction(ISD::ABS, MVT::v4i32, Legal); - setOperationAction(ISD::ABS, MVT::v8i16, Legal); - setOperationAction(ISD::ABS, MVT::v16i8, Legal); - } - // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -323,12 +317,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // to speed up scalar BSWAP64. // CTPOP or CTTZ were introduced in P8/P9 respectively setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - if (Subtarget.isISA3_0()) { + if (Subtarget.hasP9Vector()) setOperationAction(ISD::BSWAP, MVT::i64 , Custom); + else + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); + if (Subtarget.isISA3_0()) { setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } @@ -554,6 +550,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); + setOperationAction(ISD::ABS, VT, Custom); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { @@ -586,6 +583,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); + setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::SELECT_CC, VT, Promote); AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); setOperationAction(ISD::STORE, VT, Promote); @@ -626,7 +624,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); @@ -659,6 +656,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + // Without hasP8Altivec set, v2i64 SMAX isn't available. + // But ABS custom lowering requires SMAX support. 
+ if (!Subtarget.hasP8Altivec()) + setOperationAction(ISD::ABS, MVT::v2i64, Expand); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); @@ -727,12 +729,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); - // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); @@ -792,12 +788,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); - // Vector operation legalization checks the result type of - // SIGN_EXTEND_INREG, overall legalization checks the inner type. - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + // Custom handling for partial vectors of integers converted to + // floating point. We already have optimal handling for v2i32 through + // the DAG combine, so those aren't necessary. + setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Legal); @@ -1055,6 +1056,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); @@ -1076,6 +1078,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::TRUNCATE); + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); @@ -1088,6 +1092,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::FSQRT); } + if (Subtarget.hasP9Altivec()) { + setTargetDAGCombine(ISD::ABS); + setTargetDAGCombine(ISD::VSELECT); + } + // Darwin long double math library functions have $LDBL128 appended. 
if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); @@ -1348,6 +1357,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; + case PPCISD::VABSD: return "PPCISD::VABSD"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; @@ -1355,6 +1365,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; + case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; } return nullptr; } @@ -2214,11 +2225,10 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. - KnownBits LHSKnown, RHSKnown; - DAG.computeKnownBits(N.getOperand(0), LHSKnown); + KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); if (LHSKnown.Zero.getBoolValue()) { - DAG.computeKnownBits(N.getOperand(1), RHSKnown); + KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1)); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { @@ -2317,8 +2327,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. - KnownBits LHSKnown; - DAG.computeKnownBits(N.getOperand(0), LHSKnown); + KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't @@ -2405,6 +2414,28 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, return true; } +/// Returns true if we should use a direct load into vector instruction +/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence. +static bool usePartialVectorLoads(SDNode *N) { + if (!N->hasOneUse()) + return false; + + // If there are any other uses other than scalar to vector, then we should + // keep it as a scalar load -> direct move pattern to prevent multiple + // loads. Currently, only check for i64 since we have lxsd/lfd to do this + // efficiently, but no update equivalent. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + EVT MemVT = LD->getMemoryVT(); + if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) { + SDNode *User = *(LD->use_begin()); + if (User->getOpcode() == ISD::SCALAR_TO_VECTOR) + return true; + } + } + + return false; +} + /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -2430,6 +2461,13 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, } else return false; + // Do not generate pre-inc forms for specific loads that feed scalar_to_vector + // instructions because we can fold these into a more efficient instruction + // instead, (such as LXSD). 
+ if (isLoad && usePartialVectorLoads(N)) { + return false; + } + // PowerPC doesn't have preinc load/store instructions for vectors (except // for QPX, which does have preinc r+r forms). if (VT.isVector()) { @@ -2674,7 +2712,8 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. - if (Subtarget.isSVR4ABI() && isPositionIndependent()) { + if (Subtarget.isSVR4ABI() && + (Subtarget.isPPC64() || isPositionIndependent())) { if (Subtarget.isPPC64()) setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); @@ -3480,9 +3519,14 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // Argument stored in memory. assert(VA.isMemLoc()); + // Get the extended size of the argument type in stack unsigned ArgSize = VA.getLocVT().getStoreSize(); - int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(), - isImmutable); + // Get the actual size of the argument type + unsigned ObjSize = VA.getValVT().getStoreSize(); + unsigned ArgOffset = VA.getLocMemOffset(); + // Stack objects in PPC32 are right justified. + ArgOffset += ArgSize - ObjSize; + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); @@ -3935,7 +3979,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); - /* fall through */ + LLVM_FALLTHROUGH; case MVT::v4f64: case MVT::v4i1: @@ -5053,9 +5097,15 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live // into the call. - if (isSVR4ABI && isPPC64 && !isPatchPoint) { + // We do need to reserve X2 to appease the verifier for the PATCHPOINT. + if (isSVR4ABI && isPPC64) { setUsesTOCBasePtr(DAG); - Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + + // We cannot add X2 as an operand here for PATCHPOINT, because there is no + // way to mark dependencies as implicit here. We will add the X2 dependency + // in EmitInstrWithCustomInserter. + if (!isPatchPoint) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); } return CallOpc; @@ -5437,10 +5487,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Arg = PtrOff; } - if (VA.isRegLoc()) { - if (Arg.getValueType() == MVT::i1) - Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); + // When useCRBits() is true, there can be i1 arguments. + // It is because getRegisterType(MVT::i1) => MVT::i1, + // and for other integer types getRegisterType() => MVT::i32. + // Extend i1 and ensure callee will get i32. + if (Arg.getValueType() == MVT::i1) + Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + dl, MVT::i32, Arg); + if (VA.isRegLoc()) { seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. 
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); @@ -6073,7 +6128,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); - /* fall through */ + LLVM_FALLTHROUGH; case MVT::v4f64: case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; @@ -7228,10 +7283,83 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, return FP; } +static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { + + EVT VecVT = Vec.getValueType(); + assert(VecVT.isVector() && "Expected a vector type."); + assert(VecVT.getSizeInBits() < 128 && "Vector is already full width."); + + EVT EltVT = VecVT.getVectorElementType(); + unsigned WideNumElts = 128 / EltVT.getSizeInBits(); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); + + unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements(); + SmallVector<SDValue, 16> Ops(NumConcat); + Ops[0] = Vec; + SDValue UndefVec = DAG.getUNDEF(VecVT); + for (unsigned i = 1; i < NumConcat; ++i) + Ops[i] = UndefVec; + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); +} + +SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, + const SDLoc &dl) const { + + unsigned Opc = Op.getOpcode(); + assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) && + "Unexpected conversion type"); + assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) && + "Supports conversions to v2f64/v4f32 only."); + + bool SignedConv = Opc == ISD::SINT_TO_FP; + bool FourEltRes = Op.getValueType() == MVT::v4f32; + + SDValue Wide = widenVec(DAG, Op.getOperand(0), dl); + EVT WideVT = Wide.getValueType(); + unsigned WideNumElts = WideVT.getVectorNumElements(); + MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64; + + SmallVector<int, 16> ShuffV; + for (unsigned i = 0; i < WideNumElts; ++i) + ShuffV.push_back(i + WideNumElts); + + int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2; + int SaveElts = FourEltRes ? 4 : 2; + if (Subtarget.isLittleEndian()) + for (int i = 0; i < SaveElts; i++) + ShuffV[i * Stride] = i; + else + for (int i = 1; i <= SaveElts; i++) + ShuffV[i * Stride - 1] = i - 1; + + SDValue ShuffleSrc2 = + SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT); + SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV); + unsigned ExtendOp = + SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST; + + SDValue Extend; + if (!Subtarget.hasP9Altivec() && SignedConv) { + Arrange = DAG.getBitcast(IntermediateVT, Arrange); + Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange, + DAG.getValueType(Op.getOperand(0).getValueType())); + } else + Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange); + + return DAG.getNode(Opc, dl, Op.getValueType(), Extend); +} + SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + EVT InVT = Op.getOperand(0).getValueType(); + EVT OutVT = Op.getValueType(); + if (OutVT.isVector() && OutVT.isFloatingPoint() && + isOperationCustom(Op.getOpcode(), InVT)) + return LowerINT_TO_FPVector(Op, DAG, dl); + // Conversions to f128 are legal. 
if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) return Op; @@ -8454,17 +8582,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); - // If the source for the shuffle is a scalar_to_vector that came from a - // 32-bit load, it will have used LXVWSX so we don't need to splat again. - if (Subtarget.hasP9Vector() && - ((isLittleEndian && SplatIdx == 3) || - (!isLittleEndian && SplatIdx == 0))) { - SDValue Src = V1.getOperand(0); - if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && - Src.getOperand(0).getOpcode() == ISD::LOAD && - Src.getOperand(0).hasOneUse()) - return V1; - } SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, DAG.getConstant(SplatIdx, dl, MVT::i32)); @@ -8913,35 +9030,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::R2, MVT::i32); } - // We are looking for absolute values here. - // The idea is to try to fit one of two patterns: - // max (a, (0-a)) OR max ((0-a), a) - if (Subtarget.hasP9Vector() && - (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { - SDValue V1 = Op.getOperand(1); - SDValue V2 = Op.getOperand(2); - if (V1.getSimpleValueType() == V2.getSimpleValueType() && - (V1.getSimpleValueType() == MVT::v4i32 || - V1.getSimpleValueType() == MVT::v8i16 || - V1.getSimpleValueType() == MVT::v16i8)) { - if ( V1.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && - V1.getOperand(1) == V2 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); - } - - if ( V2.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && - V2.getOperand(1) == V1 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); - } - } - } - // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. int CompareOpc; @@ -9092,30 +9180,6 @@ SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO); } -SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int - // instructions), but for smaller types, we need to first extend up to v2i32 - // before doing going farther. 
- if (Op.getValueType() == MVT::v2i64) { - EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - if (ExtVT != MVT::v2i32) { - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); - Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op, - DAG.getValueType(EVT::getVectorVT(*DAG.getContext(), - ExtVT.getVectorElementType(), 4))); - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); - Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op, - DAG.getValueType(MVT::v2i32)); - } - - return Op; - } - - return SDValue(); -} - SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -9506,6 +9570,44 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { } } +SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { + + assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS"); + + EVT VT = Op.getValueType(); + assert(VT.isVector() && + "Only set vector abs as custom, scalar abs shouldn't reach here!"); + assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + VT == MVT::v16i8) && + "Unexpected vector element type!"); + assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) && + "Current subtarget doesn't support smax v2i64!"); + + // For vector abs, it can be lowered to: + // abs x + // ==> + // y = -x + // smax(x, y) + + SDLoc dl(Op); + SDValue X = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X); + + // SMAX patch https://reviews.llvm.org/D47332 + // hasn't landed yet, so use intrinsic first here. + // TODO: Should use SMAX directly once SMAX patch landed + Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw; + if (VT == MVT::v2i64) + BifID = Intrinsic::ppc_altivec_vmaxsd; + else if (VT == MVT::v8i16) + BifID = Intrinsic::ppc_altivec_vmaxsh; + else if (VT == MVT::v16i8) + BifID = Intrinsic::ppc_altivec_vmaxsb; + + return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -9555,10 +9657,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::ABS: return LowerABS(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -9635,6 +9737,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; + case ISD::BITCAST: + // Don't handle bitcast here. 
+ return; } } @@ -9798,17 +9903,14 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, return BB; } -MachineBasicBlock * -PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, - MachineBasicBlock *BB, - bool is8bit, // operation - unsigned BinOpcode, - unsigned CmpOpcode, - unsigned CmpPred) const { +MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( + MachineInstr &MI, MachineBasicBlock *BB, + bool is8bit, // operation + unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) - return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, - CmpOpcode, CmpPred); + return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode, + CmpPred); // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -9832,7 +9934,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = - CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; + CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); if (CmpOpcode) @@ -9843,22 +9945,25 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); - const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass - : &PPC::GPRCRegClass; + const TargetRegisterClass *RC = + is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); unsigned ShiftReg = - isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); - unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); - unsigned MaskReg = RegInfo.createVirtualRegister(RC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); - unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); + unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); unsigned Ptr1Reg; - unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); + unsigned TmpReg = + (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC); // thisMBB: // ... @@ -9887,82 +9992,107 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) - .addReg(ptrA).addReg(ptrB); + .addReg(ptrA) + .addReg(ptrB); } else { Ptr1Reg = ptrB; } - BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) - .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + // We need use 32-bit subregister to avoid mismatch register class in 64-bit + // mode. + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) + .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0) + .addImm(3) + .addImm(27) + .addImm(is8bit ? 28 : 27); if (!isLittleEndian) - BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) - .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); + BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) + .addReg(Shift1Reg) + .addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) - .addReg(Ptr1Reg).addImm(0).addImm(61); + .addReg(Ptr1Reg) + .addImm(0) + .addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) - .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); - BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) - .addReg(incr).addReg(ShiftReg); + .addReg(Ptr1Reg) + .addImm(0) + .addImm(0) + .addImm(29); + BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); - BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); + BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) + .addReg(Mask3Reg) + .addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) - .addReg(Mask2Reg).addReg(ShiftReg); + .addReg(Mask2Reg) + .addReg(ShiftReg); BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(ZeroReg).addReg(PtrReg); + .addReg(ZeroReg) + .addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) - .addReg(Incr2Reg).addReg(TmpDestReg); - BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) - .addReg(TmpDestReg).addReg(MaskReg); - BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) - .addReg(TmpReg).addReg(MaskReg); + .addReg(Incr2Reg) + .addReg(TmpDestReg); + BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) + .addReg(TmpDestReg) + .addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg); if (CmpOpcode) { // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. - unsigned SReg = RegInfo.createVirtualRegister(RC); - BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) - .addReg(TmpDestReg).addReg(MaskReg); + unsigned SReg = RegInfo.createVirtualRegister(GPRC); + BuildMI(BB, dl, TII->get(PPC::AND), SReg) + .addReg(TmpDestReg) + .addReg(MaskReg); unsigned ValueReg = SReg; unsigned CmpReg = Incr2Reg; if (CmpOpcode == PPC::CMPW) { - ValueReg = RegInfo.createVirtualRegister(RC); + ValueReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) - .addReg(SReg).addReg(ShiftReg); - unsigned ValueSReg = RegInfo.createVirtualRegister(RC); + .addReg(SReg) + .addReg(ShiftReg); + unsigned ValueSReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(is8bit ? 
PPC::EXTSB : PPC::EXTSH), ValueSReg) - .addReg(ValueReg); + .addReg(ValueReg); ValueReg = ValueSReg; CmpReg = incr; } BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) - .addReg(CmpReg).addReg(ValueReg); + .addReg(CmpReg) + .addReg(ValueReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); + .addImm(CmpPred) + .addReg(PPC::CR0) + .addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; } - BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) - .addReg(Tmp3Reg).addReg(Tmp2Reg); + BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) - .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); + .addReg(Tmp4Reg) + .addReg(ZeroReg) + .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; - BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) - .addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) + .addReg(TmpDestReg) + .addReg(ShiftReg); return BB; } @@ -9979,10 +10109,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); - unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); @@ -10045,10 +10171,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) - .addReg(PPC::X2) - .addImm(TOCOffset) - .addReg(BufReg); - MIB.setMemRefs(MMOBegin, MMOEnd); + .addReg(PPC::X2) + .addImm(TOCOffset) + .addReg(BufReg) + .cloneMemRefs(MI); } // Naked functions never have a base pointer, and so we use r1. For all @@ -10063,8 +10189,8 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, TII->get(Subtarget.isPPC64() ? 
PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) - .addReg(BufReg); - MIB.setMemRefs(MMOBegin, MMOEnd); + .addReg(BufReg) + .cloneMemRefs(MI); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); @@ -10097,8 +10223,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, .addImm(LabelOffset) .addReg(BufReg); } - - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.cloneMemRefs(MI); BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); mainMBB->addSuccessor(sinkMBB); @@ -10122,10 +10247,6 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); - MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -10163,7 +10284,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, .addImm(0) .addReg(BufReg); } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.cloneMemRefs(MI); // Reload IP if (PVT == MVT::i64) { @@ -10175,7 +10296,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, .addImm(LabelOffset) .addReg(BufReg); } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.cloneMemRefs(MI); // Reload SP if (PVT == MVT::i64) { @@ -10187,7 +10308,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, .addImm(SPOffset) .addReg(BufReg); } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.cloneMemRefs(MI); // Reload BP if (PVT == MVT::i64) { @@ -10199,16 +10320,15 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, .addImm(BPOffset) .addReg(BufReg); } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.cloneMemRefs(MI); // Reload TOC if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) - .addImm(TOCOffset) - .addReg(BufReg); - - MIB.setMemRefs(MMOBegin, MMOEnd); + .addImm(TOCOffset) + .addReg(BufReg) + .cloneMemRefs(MI); } // Jump @@ -10232,7 +10352,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // way to mark the dependence as implicit there, and so the stackmap code // will confuse it with a regular operand. Instead, add the dependence // here. - setUsesTOCBasePtr(*BB->getParent()); MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); } @@ -10257,8 +10376,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *F = BB->getParent(); if (MI.getOpcode() == PPC::SELECT_CC_I4 || - MI.getOpcode() == PPC::SELECT_CC_I8 || - MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { + MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || + MI.getOpcode() == PPC::SELECT_I8) { SmallVector<MachineOperand, 2> Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8) @@ -10403,9 +10522,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) - .addReg(HiReg).addReg(ReadAgainReg); + .addReg(HiReg) + .addReg(ReadAgainReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); + .addImm(PPC::PRED_NE) + .addReg(CmpReg) + .addMBB(readMBB); BB->addSuccessor(readMBB); BB->addSuccessor(sinkMBB); @@ -10575,27 +10697,35 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // st[bhwd]cx. 
dest, ptr // exitBB: BB = loop1MBB; - BuildMI(BB, dl, TII->get(LoadMnemonic), dest) - .addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) - .addReg(oldval).addReg(dest); + .addReg(oldval) + .addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) - .addReg(newval).addReg(ptrA).addReg(ptrB); + .addReg(newval) + .addReg(ptrA) + .addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) - .addReg(dest).addReg(ptrA).addReg(ptrB); + .addReg(dest) + .addReg(ptrA) + .addReg(ptrB); BB->addSuccessor(exitMBB); // exitMBB: @@ -10630,24 +10760,26 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); - const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass - : &PPC::GPRCRegClass; + const TargetRegisterClass *RC = + is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); unsigned ShiftReg = - isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); - unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); - unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); - unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); - unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); - unsigned MaskReg = RegInfo.createVirtualRegister(RC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); + unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC); + unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC); + unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); unsigned Ptr1Reg; - unsigned TmpReg = RegInfo.createVirtualRegister(RC); + unsigned TmpReg = RegInfo.createVirtualRegister(GPRC); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... @@ -10684,74 +10816,107 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) - .addReg(ptrA).addReg(ptrB); + .addReg(ptrA) + .addReg(ptrB); } else { Ptr1Reg = ptrB; } - BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) - .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + + // We need use 32-bit subregister to avoid mismatch register class in 64-bit + // mode. + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) + .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0) + .addImm(3) + .addImm(27) + .addImm(is8bit ? 28 : 27); if (!isLittleEndian) - BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) - .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); + BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) + .addReg(Shift1Reg) + .addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) - .addReg(Ptr1Reg).addImm(0).addImm(61); + .addReg(Ptr1Reg) + .addImm(0) + .addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) - .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); + .addReg(Ptr1Reg) + .addImm(0) + .addImm(0) + .addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) - .addReg(newval).addReg(ShiftReg); + .addReg(newval) + .addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) - .addReg(oldval).addReg(ShiftReg); + .addReg(oldval) + .addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) - .addReg(Mask3Reg).addImm(65535); + .addReg(Mask3Reg) + .addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) - .addReg(Mask2Reg).addReg(ShiftReg); + .addReg(Mask2Reg) + .addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) - .addReg(NewVal2Reg).addReg(MaskReg); + .addReg(NewVal2Reg) + .addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) - .addReg(OldVal2Reg).addReg(MaskReg); + .addReg(OldVal2Reg) + .addReg(MaskReg); BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(ZeroReg).addReg(PtrReg); - BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) - .addReg(TmpDestReg).addReg(MaskReg); + .addReg(ZeroReg) + .addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::AND), TmpReg) + .addReg(TmpDestReg) + .addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) - .addReg(TmpReg).addReg(OldVal3Reg); + .addReg(TmpReg) + .addReg(OldVal3Reg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; - BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) - .addReg(TmpDestReg).addReg(MaskReg); - BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) - .addReg(Tmp2Reg).addReg(NewVal3Reg); - BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) - .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) + .addReg(TmpDestReg) + .addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg) + .addReg(Tmp2Reg) + .addReg(NewVal3Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)) + .addReg(Tmp4Reg) + .addReg(ZeroReg) + .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; - BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) - .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::STWCX)) + .addReg(TmpDestReg) 
+ .addReg(ZeroReg) + .addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; - BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) - .addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) + .addReg(TmpReg) + .addReg(ShiftReg); } else if (MI.getOpcode() == PPC::FADDrtz) { // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR @@ -10788,9 +10953,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? - &PPC::GPRCRegClass : - &PPC::G8RCRegClass); + unsigned Dest = RegInfo.createVirtualRegister( + Opcode == PPC::ANDIo ? &PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) @@ -11242,9 +11406,8 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. - KnownBits Op1Known, Op2Known; - DAG.computeKnownBits(N->getOperand(0), Op1Known); - DAG.computeKnownBits(N->getOperand(1), Op2Known); + KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0)); + KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1)); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. @@ -11761,6 +11924,45 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, ShiftCst); } +SDValue PPCTargetLowering::combineSetCC(SDNode *N, + DAGCombinerInfo &DCI) const { + assert(N->getOpcode() == ISD::SETCC && + "Should be called with a SETCC node"); + + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (CC == ISD::SETNE || CC == ISD::SETEQ) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // If there is a '0 - y' pattern, canonicalize the pattern to the RHS. + if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && + LHS.hasOneUse()) + std::swap(LHS, RHS); + + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && + RHS.hasOneUse()) { + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + EVT OpVT = LHS.getValueType(); + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); + } + } + + return DAGCombineTruncBoolExt(N, DCI); +} + +// Is this an extending load from an f32 to an f64? +static bool isFPExtLoad(SDValue Op) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode())) + return LD->getExtensionType() == ISD::EXTLOAD && + Op.getValueType() == MVT::f64; + return false; +} + /// Reduces the number of fp-to-int conversion when building a vector. 
/// /// If this vector is built out of floating to integer conversions, @@ -11795,11 +11997,18 @@ combineElementTruncationToVectorTruncation(SDNode *N, SmallVector<SDValue, 4> Ops; EVT TargetVT = N->getValueType(0); for (int i = 0, e = N->getNumOperands(); i < e; ++i) { - if (N->getOperand(i).getOpcode() != PPCISD::MFVSR) + SDValue NextOp = N->getOperand(i); + if (NextOp.getOpcode() != PPCISD::MFVSR) return SDValue(); - unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode(); + unsigned NextConversion = NextOp.getOperand(0).getOpcode(); if (NextConversion != FirstConversion) return SDValue(); + // If we are converting to 32-bit integers, we need to add an FP_ROUND. + // This is not valid if the input was originally double precision. It is + // also not profitable to do unless this is an extending load in which + // case doing this combine will allow us to combine consecutive loads. + if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0))) + return SDValue(); if (N->getOperand(i) != FirstInput) IsSplat = false; } @@ -11813,8 +12022,9 @@ combineElementTruncationToVectorTruncation(SDNode *N, // Now that we know we have the right type of node, get its operands for (int i = 0, e = N->getNumOperands(); i < e; ++i) { SDValue In = N->getOperand(i).getOperand(0); - // For 32-bit values, we need to add an FP_ROUND node. if (Is32Bit) { + // For 32-bit values, we need to add an FP_ROUND node (if we made it + // here, we know that all inputs are extending loads so this is safe). if (In.isUndef()) Ops.push_back(DAG.getUNDEF(SrcVT)); else { @@ -11864,7 +12074,8 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; } // Not a build vector of (possibly fp_rounded) loads. - if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) + if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) || + N->getNumOperands() == 1) return SDValue(); for (int i = 1, e = N->getNumOperands(); i < e; ++i) { @@ -11991,10 +12202,15 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { auto isSExtOfVecExtract = [&](SDValue Op) -> bool { if (!Op) return false; - if (Op.getOpcode() != ISD::SIGN_EXTEND) + if (Op.getOpcode() != ISD::SIGN_EXTEND && + Op.getOpcode() != ISD::SIGN_EXTEND_INREG) return false; + // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value + // of the right width. SDValue Extract = Op.getOperand(0); + if (Extract.getOpcode() == ISD::ANY_EXTEND) + Extract = Extract.getOperand(0); if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return false; @@ -12082,8 +12298,10 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, return Reduced; // If we're building a vector out of extended elements from another vector - // we have P9 vector integer extend instructions. - if (Subtarget.hasP9Altivec()) { + // we have P9 vector integer extend instructions. The code assumes legal + // input types (i.e. it can't handle things like v4i16) so do not run before + // legalization. 
+ if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) { Reduced = combineBVOfVecSExt(N, DAG); if (Reduced) return Reduced; @@ -12438,6 +12656,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: break; + case ISD::ADD: + return combineADD(N, DCI); case ISD::SHL: return combineSHL(N, DCI); case ISD::SRA: @@ -12464,7 +12684,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: + return combineTRUNCATE(N, DCI); case ISD::SETCC: + if (SDValue CSCC = combineSetCC(N, DCI)) + return CSCC; + LLVM_FALLTHROUGH; case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: @@ -12487,9 +12711,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, (Op1VT == MVT::i32 || Op1VT == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) { - // STBRX can only handle simple types. + // STBRX can only handle simple types and it makes no sense to store less + // two bytes in byte-reversed order. EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); - if (mVT.isExtended()) + if (mVT.isExtended() || mVT.getSizeInBits() < 16) break; SDValue BSwapOp = N->getOperand(1).getOperand(0); @@ -12865,6 +13090,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } } + + // Combine vmaxsw/h/b(a, a's negation) to abs(a) + // Expose the vabsduw/h/b opportunity for down stream + if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() && + (IID == Intrinsic::ppc_altivec_vmaxsw || + IID == Intrinsic::ppc_altivec_vmaxsh || + IID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = N->getOperand(1); + SDValue V2 = N->getOperand(2); + if ((V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8) && + V1.getSimpleValueType() == V2.getSimpleValueType()) { + // (0-a, a) + if (V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2) { + return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2); + } + // (a, 0-a) + if (V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + // (x-y, y-x) + if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB && + V1.getOperand(0) == V2.getOperand(1) && + V1.getOperand(1) == V2.getOperand(0)) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + } + } } break; @@ -13097,6 +13355,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); + case ISD::ABS: + return combineABS(N, DCI); + case ISD::VSELECT: + return combineVSelect(N, DCI); } return SDValue(); @@ -13239,7 +13501,8 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const { } else if (Constraint == "wc") { // individual CR bits. return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || - Constraint == "wf" || Constraint == "ws") { + Constraint == "wf" || Constraint == "ws" || + Constraint == "wi") { return C_RegisterClass; // VSX registers. 
} return TargetLowering::getConstraintType(Constraint); @@ -13269,6 +13532,8 @@ PPCTargetLowering::getSingleConstraintMatchWeight( return CW_Register; else if (StringRef(constraint) == "ws" && type->isDoubleTy()) return CW_Register; + else if (StringRef(constraint) == "wi" && type->isIntegerTy(64)) + return CW_Register; // just hold 64-bit integers data. switch (*constraint) { default: @@ -13351,7 +13616,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); } else if ((Constraint == "wa" || Constraint == "wd" || - Constraint == "wf") && Subtarget.hasVSX()) { + Constraint == "wf" || Constraint == "wi") && + Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); } else if (Constraint == "ws" && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) @@ -13586,6 +13852,35 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, report_fatal_error("Invalid register name global variable"); } +bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { + // 32-bit SVR4 ABI access everything as got-indirect. + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + return true; + + CodeModel::Model CModel = getTargetMachine().getCodeModel(); + // If it is small or large code model, module locals are accessed + // indirectly by loading their address from .toc/.got. The difference + // is that for large code model we have ADDISTocHa + LDtocL and for + // small code model we simply have LDtoc. + if (CModel == CodeModel::Small || CModel == CodeModel::Large) + return true; + + // JumpTable and BlockAddress are accessed as got-indirect. + if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA)) + return true; + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = Subtarget.classifyGlobalReference(GV); + // The NLP flag indicates that a global access has to use an + // extra indirection. + if (GVFlags & PPCII::MO_NLP_FLAG) + return true; + } + + return false; +} + bool PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The PowerPC target isn't yet aware of offsets. @@ -14104,7 +14399,30 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; - return SDValue(); + SDValue N0 = N->getOperand(0); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Subtarget.isISA3_0() || + N0.getOpcode() != ISD::SIGN_EXTEND || + N0.getOperand(0).getValueType() != MVT::i32 || + CN1 == nullptr || N->getValueType(0) != MVT::i64) + return SDValue(); + + // We can't save an operation here if the value is already extended, and + // the existing shift is easier to combine. + SDValue ExtsSrc = N0.getOperand(0); + if (ExtsSrc.getOpcode() == ISD::TRUNCATE && + ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext) + return SDValue(); + + SDLoc DL(N0); + SDValue ShiftBy = SDValue(CN1, 0); + // We want the shift amount to be i32 on the extswli, but the shift could + // have an i64. 
+ if (ShiftBy.getValueType() == MVT::i64) + ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32); + + return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0), + ShiftBy); } SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { @@ -14121,6 +14439,152 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { return SDValue(); } +// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1)) +// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0)) +// When C is zero, the equation (addi Z, -C) can be simplified to Z +// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types +static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + if (!Subtarget.isPPC64()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + auto isZextOfCompareWithConstant = [](SDValue Op) { + if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() || + Op.getValueType() != MVT::i64) + return false; + + SDValue Cmp = Op.getOperand(0); + if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() || + Cmp.getOperand(0).getValueType() != MVT::i64) + return false; + + if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) { + int64_t NegConstant = 0 - Constant->getSExtValue(); + // Due to the limitations of the addi instruction, + // -C is required to be [-32768, 32767]. + return isInt<16>(NegConstant); + } + + return false; + }; + + bool LHSHasPattern = isZextOfCompareWithConstant(LHS); + bool RHSHasPattern = isZextOfCompareWithConstant(RHS); + + // If there is a pattern, canonicalize a zext operand to the RHS. + if (LHSHasPattern && !RHSHasPattern) + std::swap(LHS, RHS); + else if (!LHSHasPattern && !RHSHasPattern) + return SDValue(); + + SDLoc DL(N); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); + SDValue Cmp = RHS.getOperand(0); + SDValue Z = Cmp.getOperand(0); + auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1)); + + assert(Constant && "Constant Should not be a null pointer."); + int64_t NegConstant = 0 - Constant->getSExtValue(); + + switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) { + default: break; + case ISD::SETNE: { + // when C == 0 + // --> addze X, (addic Z, -1).carry + // / + // add X, (zext(setne Z, C))-- + // \ when -32768 <= -C <= 32767 && C != 0 + // --> addze X, (addic (addi Z, -C), -1).carry + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, + DAG.getConstant(NegConstant, DL, MVT::i64)); + SDValue AddOrZ = NegConstant != 0 ? Add : Z; + SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64)); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue(Addc.getNode(), 1)); + } + case ISD::SETEQ: { + // when C == 0 + // --> addze X, (subfic Z, 0).carry + // / + // add X, (zext(sete Z, C))-- + // \ when -32768 <= -C <= 32767 && C != 0 + // --> addze X, (subfic (addi Z, -C), 0).carry + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, + DAG.getConstant(NegConstant, DL, MVT::i64)); + SDValue AddOrZ = NegConstant != 0 ? 
Add : Z; + SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + DAG.getConstant(0, DL, MVT::i64), AddOrZ); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue(Subc.getNode(), 1)); + } + } + + return SDValue(); +} + +SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget)) + return Value; + + return SDValue(); +} + +// Detect TRUNCATE operations on bitcasts of float128 values. +// What we are looking for here is the situtation where we extract a subset +// of bits from a 128 bit float. +// This can be of two forms: +// 1) BITCAST of f128 feeding TRUNCATE +// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE +// The reason this is required is because we do not have a legal i128 type +// and so we want to prevent having to store the f128 and then reload part +// of it. +SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N, + DAGCombinerInfo &DCI) const { + // If we are using CRBits then try that first. + if (Subtarget.useCRBits()) { + // Check if CRBits did anything and return that if it did. + if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI)) + return CRTruncValue; + } + + SDLoc dl(N); + SDValue Op0 = N->getOperand(0); + + // Looking for a truncate of i128 to i64. + if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64) + return SDValue(); + + int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0; + + // SRL feeding TRUNCATE. + if (Op0.getOpcode() == ISD::SRL) { + ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + // The right shift has to be by 64 bits. + if (!ConstNode || ConstNode->getZExtValue() != 64) + return SDValue(); + + // Switch the element number to extract. + EltToExtract = EltToExtract ? 0 : 1; + // Update Op0 past the SRL. + Op0 = Op0.getOperand(0); + } + + // BITCAST feeding a TRUNCATE possibly via SRL. + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getValueType() == MVT::i128 && + Op0.getOperand(0).getValueType() == MVT::f128) { + SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0)); + return DCI.DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast, + DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32)); + } + return SDValue(); +} + bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) @@ -14156,6 +14620,15 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); } +bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const { + if (!Subtarget.hasVSX()) + return false; + if (Subtarget.hasP9Vector() && VT == MVT::f128) + return true; + return VT == MVT::f32 || VT == MVT::f64 || + VT == MVT::v4f32 || VT == MVT::v2f64; +} + bool PPCTargetLowering:: isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { const Value *Mask = AndI.getOperand(1); @@ -14172,3 +14645,109 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { // For non-constant masks, we can always use the record-form and. 
return true; } + +// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0) +// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0) +// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0) +// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0) +// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32 +SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const { + assert((N->getOpcode() == ISD::ABS) && "Need ABS node here"); + assert(Subtarget.hasP9Altivec() && + "Only combine this when P9 altivec supported!"); + EVT VT = N->getValueType(0); + if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + if (N->getOperand(0).getOpcode() == ISD::SUB) { + // Even for signed integers, if it's known to be positive (as signed + // integer) due to zero-extended inputs. + unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode(); + unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode(); + if ((SubOpcd0 == ISD::ZERO_EXTEND || + SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) && + (SubOpcd1 == ISD::ZERO_EXTEND || + SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), + N->getOperand(0)->getOperand(0), + N->getOperand(0)->getOperand(1), + DAG.getTargetConstant(0, dl, MVT::i32)); + } + + // For type v4i32, it can be optimized with xvnegsp + vabsduw + if (N->getOperand(0).getValueType() == MVT::v4i32 && + N->getOperand(0).hasOneUse()) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), + N->getOperand(0)->getOperand(0), + N->getOperand(0)->getOperand(1), + DAG.getTargetConstant(1, dl, MVT::i32)); + } + } + + return SDValue(); +} + +// For type v4i32/v8ii16/v16i8, transform +// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b) +// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b) +// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b) +// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b) +SDValue PPCTargetLowering::combineVSelect(SDNode *N, + DAGCombinerInfo &DCI) const { + assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here"); + assert(Subtarget.hasP9Altivec() && + "Only combine this when P9 altivec supported!"); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue TrueOpnd = N->getOperand(1); + SDValue FalseOpnd = N->getOperand(2); + EVT VT = N->getOperand(1).getValueType(); + + if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB || + FalseOpnd.getOpcode() != ISD::SUB) + return SDValue(); + + // ABSD only available for type v4i32/v8i16/v16i8 + if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) + return SDValue(); + + // At least to save one more dependent computation + if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse())) + return SDValue(); + + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Can only handle unsigned comparison here + switch (CC) { + default: + return SDValue(); + case ISD::SETUGT: + case ISD::SETUGE: + break; + case ISD::SETULT: + case ISD::SETULE: + std::swap(TrueOpnd, FalseOpnd); + break; + } + + SDValue CmpOpnd1 = Cond.getOperand(0); + SDValue CmpOpnd2 = Cond.getOperand(1); + + // SETCC CmpOpnd1 CmpOpnd2 cond + // TrueOpnd = CmpOpnd1 - CmpOpnd2 + // FalseOpnd = CmpOpnd2 - CmpOpnd1 + if (TrueOpnd.getOperand(0) 
== CmpOpnd1 && + TrueOpnd.getOperand(1) == CmpOpnd2 && + FalseOpnd.getOperand(0) == CmpOpnd2 && + FalseOpnd.getOperand(1) == CmpOpnd1) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(), + CmpOpnd1, CmpOpnd2, + DAG.getTargetConstant(0, dl, MVT::i32)); + } + + return SDValue(); +} |
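
The patch above leans on a few algebraic identities: LowerABS rewrites vector abs as smax(x, 0-x), the VSELECT/SETCC combine folds the |a-b| pattern into a single absolute-difference node (VABSD), and combineSetCC turns x == 0-y into x+y == 0. The following is a minimal, self-contained sketch that sanity-checks those identities on plain scalars; it is illustrative only, uses no LLVM APIs, and all helper names in it are hypothetical.

```cpp
// Sanity checks for the scalar identities behind the PPC lowering/combines:
//   1) abs(x)       == max(x, 0 - x)          (LowerABS: negate then smax)
//   2) |a - b|      == (a > b) ? a - b : b - a (VSELECT/SETCC -> VABSD pattern)
//   3) (x == 0 - y) <=> (x + y == 0)           (combineSetCC rewrite)
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static int32_t absViaSmax(int32_t X) {
  // Mirrors the custom lowering: y = 0 - x; result = smax(x, y).
  int32_t Y = 0 - X;
  return std::max(X, Y);
}

static uint32_t absDiff(uint32_t A, uint32_t B) {
  // Mirrors vselect(setcc(a, b, setugt), a - b, b - a).
  return A > B ? A - B : B - A;
}

int main() {
  // 1) abs via smax (INT_MIN excluded, since 0 - INT_MIN overflows).
  for (int32_t X : {-7, -1, 0, 1, 42})
    assert(absViaSmax(X) == (X < 0 ? -X : X));

  // 2) unsigned absolute difference is symmetric in its operands.
  assert(absDiff(10u, 3u) == 7u && absDiff(3u, 10u) == 7u);

  // 3) x == 0 - y is equivalent to x + y == 0.
  for (int64_t X : {-5LL, 0LL, 9LL})
    for (int64_t Y : {-9LL, 0LL, 5LL})
      assert((X == 0 - Y) == (X + Y == 0));

  std::puts("identities hold");
  return 0;
}
```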
