Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 2403
1 file changed, 1935 insertions, 468 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index cf738cd66434..287e2e60e572 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -210,6 +210,8 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) @@ -284,6 +286,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); // Vector reductions @@ -292,6 +296,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -341,6 +349,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); // No native support for these. setOperationAction(ISD::FDIV, VT, Expand); @@ -358,6 +370,17 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } } + // Custom Expand smaller than legal vector reductions to prevent false zero + // items being added. + setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. 
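A note on the VECREDUCE hunks above: the v4f16/v2f16 reductions are marked Custom because letting type legalization widen them would introduce the "false zero items" the comment mentions. The following standalone C++ model (not part of the patch; the function name and values are made up for illustration) uses a min-style reduction, the clearest case, to show why padding a short vector with zero lanes changes the reduction result.

#include <algorithm>
#include <cassert>
#include <vector>

// Reduce all lanes of the vector with a min operation.
static float minReduce(const std::vector<float> &v) {
  float m = v.front();
  for (float x : v)
    m = std::min(m, x);
  return m;
}

int main() {
  std::vector<float> v4 = {2.0f, 3.0f, 1.5f, 4.0f}; // stands in for a v4f16 input
  std::vector<float> padded = v4;
  padded.resize(8, 0.0f);                           // the "false zero items"
  assert(minReduce(v4) == 1.5f);                    // reducing only real lanes: correct
  assert(minReduce(padded) == 0.0f);                // a padded zero lane wins: wrong
  return 0;
}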
@@ -717,13 +740,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } + if (Subtarget->hasBF16()) { + addRegisterClass(MVT::bf16, &ARM::HPRRegClass); + setAllExpand(MVT::bf16); + if (!Subtarget->hasFullFP16()) + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + } + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -771,6 +800,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } + + if (Subtarget->hasBF16()) { + addQRTypeForNEON(MVT::v8bf16); + addDRTypeForNEON(MVT::v4bf16); + } } if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { @@ -912,9 +946,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); @@ -938,10 +969,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::VECREDUCE_ADD); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::BITCAST); + } + if (Subtarget->hasMVEIntegerOps()) { + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::FP_EXTEND); } if (!Subtarget->hasFP64()) { @@ -1356,6 +1401,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } + + // Strict floating-point comparisons need custom lowering. + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } // Use __sincos_stret if available. @@ -1413,12 +1466,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON()) { - // vmin and vmax aren't available in a scalar form, so we use - // a NEON instruction with an undef lane instead. 
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); + // vmin and vmax aren't available in a scalar form, so we can use + // a NEON instruction with an undef lane instead. This has a performance + // penalty on some cores, so we don't do this unless we have been + // asked to by the core tuning model. + if (Subtarget->useNEONForSinglePrecisionFP()) { + setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); + } setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); @@ -1446,6 +1503,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); + if (Subtarget->hasMVEIntegerOps()) + setTargetDAGCombine(ISD::VSELECT); + if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) @@ -1544,17 +1604,21 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::tSECALL: return "ARMISD::tSECALL"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::SERET_FLAG: return "ARMISD::SERET_FLAG"; case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; + case ARMISD::CMPFPE: return "ARMISD::CMPFPE"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0"; case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; @@ -1605,6 +1669,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST"; case ARMISD::VCMP: return "ARMISD::VCMP"; case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; @@ -1645,8 +1710,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMOVN: return "ARMISD::VMOVN"; + case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs"; + case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu"; + case ARMISD::VCVTN: return "ARMISD::VCVTN"; + case ARMISD::VCVTL: return "ARMISD::VCVTL"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::VADDVs: return "ARMISD::VADDVs"; + case ARMISD::VADDVu: return "ARMISD::VADDVu"; + case ARMISD::VADDLVs: return "ARMISD::VADDLVs"; + case ARMISD::VADDLVu: return "ARMISD::VADDLVu"; + case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs"; + case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu"; + case ARMISD::VADDLVps: return "ARMISD::VADDLVps"; + case 
ARMISD::VADDLVpu: return "ARMISD::VADDLVpu"; + case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps"; + case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu"; + case ARMISD::VMLAVs: return "ARMISD::VMLAVs"; + case ARMISD::VMLAVu: return "ARMISD::VMLAVu"; + case ARMISD::VMLALVs: return "ARMISD::VMLALVs"; + case ARMISD::VMLALVu: return "ARMISD::VMLALVu"; + case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs"; + case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; @@ -1950,6 +2035,35 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, } } +SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, + MVT LocVT, MVT ValVT, SDValue Val) const { + Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), + Val); + if (Subtarget->hasFullFP16()) { + Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); + } else { + Val = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getIntegerVT(ValVT.getSizeInBits()), Val); + Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); + } + return Val; +} + +SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, + MVT LocVT, MVT ValVT, + SDValue Val) const { + if (Subtarget->hasFullFP16()) { + Val = DAG.getNode(ARMISD::VMOVrh, dl, + MVT::getIntegerVT(LocVT.getSizeInBits()), Val); + } else { + Val = DAG.getNode(ISD::BITCAST, dl, + MVT::getIntegerVT(ValVT.getSizeInBits()), Val); + Val = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::getIntegerVT(LocVT.getSizeInBits()), Val); + } + return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( @@ -1977,7 +2091,8 @@ SDValue ARMTargetLowering::LowerCallResult( } SDValue Val; - if (VA.needsCustom()) { + if (VA.needsCustom() && + (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -2026,6 +2141,13 @@ SDValue ARMTargetLowering::LowerCallResult( break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); + InVals.push_back(Val); } @@ -2092,22 +2214,34 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; + bool isCmseNSCall = false; bool PreferIndirect = false; + // Determine whether this is a non-secure function call. + if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call")) + isCmseNSCall = true; + // Disable tail calls if they're not supported. 
if (!Subtarget->supportsTailCall()) isTailCall = false; + // For both the non-secure calls and the returns from a CMSE entry function, + // the function needs to do some extra work after the call, or before the + // return, respectively, thus it cannot end with a tail call + if (isCmseNSCall || AFI->isCmseNSEntryFunction()) + isTailCall = false; + if (isa<GlobalAddressSDNode>(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding. auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - if (CLI.CS) { - auto *BB = CLI.CS.getParent(); + if (CLI.CB) { + auto *BB = CLI.CB->getParent(); PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa<Instruction>(U) && @@ -2121,7 +2255,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, PreferIndirect); - if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically @@ -2182,31 +2316,50 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; } - // f64 and v2f64 might be passed in i32 pairs and must be split into pieces - if (VA.needsCustom()) { - if (VA.getLocVT() == MVT::v2f64) { - SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, dl, MVT::i32)); - - PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, - VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isRegLoc()) { - PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, - VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - } else { - assert(VA.isMemLoc()); + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { + Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); + } else { + // f16 arguments could have been extended prior to argument lowering. + // Mask them arguments if this is a CMSE nonsecure call.
+ auto ArgVT = Outs[realArgIdx].ArgVT; + if (isCmseNSCall && (ArgVT == MVT::f16)) { + auto LocBits = VA.getLocVT().getSizeInBits(); + auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); + SDValue Mask = + DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); + Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + } + } - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, - dl, DAG, VA, Flags)); - } - } else { - PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); + + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back( + LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2217,7 +2370,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isThisReturn = true; } const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { @@ -2240,9 +2393,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, - MachinePointerInfo(), - DAG.InferPtrAlignment(AddArg)); + SDValue Load = + DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), + DAG.InferPtrAlign(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } @@ -2263,8 +2416,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); - SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, - MVT::i32); + SDValue AlignNode = + DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; @@ -2306,7 +2459,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { @@ -2322,7 +2474,7 @@ 
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2336,7 +2488,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2388,7 +2540,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2400,10 +2552,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } + if (isCmseNSCall) { + assert(!isARMFunc && !isDirect && + "Cannot handle call to ARM function or direct call"); + if (NumBytes > 0) { + DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), + "call to non-secure function would " + "require passing arguments on stack", + dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + if (isStructRet) { + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "call to non-secure function would return value through pointer", + dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + } + // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + if (isCmseNSCall) + CallOpc = ARMISD::tSECALL; + else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; @@ -2463,6 +2636,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -2483,15 +2657,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// and then confiscate the rest of the parameter registers to insure /// this. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, - unsigned Align) const { + Align Alignment) const { // Byval (as with any stack) slots are always at least 4 byte aligned. - Align = std::max(Align, 4U); + Alignment = std::max(Alignment, Align(4)); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; - unsigned AlignInRegs = Align / 4; + unsigned AlignInRegs = Alignment.value() / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); @@ -2630,9 +2804,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Check that the call results are passed in the same way. 
LLVMContext &C = *DAG.getContext(); - if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, - CCAssignFnForReturn(CalleeCC, isVarArg), - CCAssignFnForReturn(CallerCC, isVarArg))) + if (!CCState::resultsCompatible( + getEffectiveCallingConv(CalleeCC, isVarArg), + getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, + CCAssignFnForReturn(CalleeCC, isVarArg), + CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -2673,7 +2849,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; - if (VA.needsCustom()) { + if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure @@ -2772,6 +2948,17 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); AFI->setReturnRegsCount(RVLocs.size()); + // Report error if cmse entry function returns structure through first ptr arg. + if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { + // Note: using an empty SDLoc(), as the first line of the function is a + // better place to report than the last line. + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "secure entry function would return value through pointer", + SDLoc().getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); @@ -2814,7 +3001,24 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, break; } - if (VA.needsCustom()) { + // Mask f16 arguments if this is a CMSE nonsecure entry. + auto RetVT = Outs[realRVLocIdx].ArgVT; + if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { + if (VA.needsCustom() && VA.getValVT() == MVT::f16) { + Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); + } else { + auto LocBits = VA.getLocVT().getSizeInBits(); + auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); + SDValue Mask = + DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); + Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + } + } + + if (VA.needsCustom() && + (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -2822,15 +3026,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 0 : 1), - Flag); + Chain = + DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 
0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 1 : 0), - Flag); + Chain = + DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -2844,22 +3048,20 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 0 : 1), - Flag); + fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 1 : 0), - Flag); + fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), - ReturnF16 ? MVT::f16 : VA.getLocVT())); + RetOps.push_back(DAG.getRegister( + VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = @@ -2893,7 +3095,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return LowerInterruptReturn(RetOps, dl, DAG); } - return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); + ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : + ARMISD::RET_FLAG; + return DAG.getNode(RetNode, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -3035,11 +3239,10 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, } if (CP->isMachineConstantPoolEntry()) - Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + Res = + DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment()); + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } @@ -3058,14 +3261,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { - CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); } else { unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad( @@ -3194,8 +3397,9 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, const auto *GA = cast<GlobalAddressSDNode>(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( - PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, - DAG.getTargetConstantPool(CPV, PtrVT, 4)), + PtrVT, DL, Chain, + DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, + DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); @@ -3214,7 +3418,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); - SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, @@ -3265,7 +3469,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3283,7 +3487,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3386,11 +3590,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // that are strings for simplicity. 
auto *CDAInit = dyn_cast<ConstantDataArray>(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); - unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); + Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); - if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || + if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize || Size == 0) return SDValue(); @@ -3429,8 +3633,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, } auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); - SDValue CPAddr = - DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); + SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { AFI->markGlobalAsPromotedToConstantPool(GVar); AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + @@ -3500,7 +3703,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3520,7 +3723,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3631,7 +3834,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); SDValue ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); - std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; + constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; SDValue Callee = DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); SDValue RegisterMask = DAG.getRegisterMask(Mask); @@ -3715,7 +3918,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3777,6 +3980,15 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_mve_pred_v2i: return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); + case Intrinsic::arm_mve_vreinterpretq: + return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::arm_mve_lsll: + return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::arm_mve_asrl: + return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -3977,6 +4189,42 @@ void 
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, AFI->setVarArgsFrameIndex(FrameIndex); } +bool ARMTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { + bool IsABIRegCopy = CC.hasValue(); + EVT ValueVT = Val.getValueType(); + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + Parts[0] = Val; + return true; + } + return false; +} + +SDValue ARMTargetLowering::joinRegisterPartsIntoValue( + SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { + bool IsABIRegCopy = CC.hasValue(); + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + SDValue Val = Parts[0]; + + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + return Val; + } + return SDValue(); +} + SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -4049,44 +4297,41 @@ SDValue ARMTargetLowering::LowerFormalArguments( if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - if (VA.needsCustom()) { + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. 
- if (VA.getLocVT() == MVT::v2f64) { - SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2; - if (VA.isMemLoc()) { - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FI)); - } else { - ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - } - ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue1, - DAG.getIntPtrConstant(0, dl)); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue2, - DAG.getIntPtrConstant(1, dl)); - } else - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + SDValue ArgValue1 = + GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, + ArgValue1, DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, + ArgValue2, DAG.getIntPtrConstant(1, dl)); + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; - - if (RegVT == MVT::f16) + if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; - else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) + else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || + RegVT == MVT::v4bf16) RC = &ARM::DPRRegClass; - else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) + else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || + RegVT == MVT::v8bf16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass @@ -4126,6 +4371,13 @@ SDValue ARMTargetLowering::LowerFormalArguments( break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); + InVals.push_back(ArgValue); } else { // VA.isRegLoc() // sanity check @@ -4349,13 +4601,16 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const SDLoc &dl) const { + SelectionDAG &DAG, const SDLoc &dl, + bool Signaling) const { assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); SDValue Cmp; if (!isFloatingPointZero(RHS)) - Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); + Cmp = DAG.getNode(Signaling ? 
ARMISD::CMPFPE : ARMISD::CMPFP, + dl, MVT::Glue, LHS, RHS); else - Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); + Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, + dl, MVT::Glue, LHS); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } @@ -4541,7 +4796,7 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { EVT VT = Op.getValueType(); - if (!Subtarget->hasDSP()) + if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) return SDValue(); if (!VT.isSimple()) return SDValue(); @@ -5413,7 +5668,12 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { // FIXME: Remove this when we have strict fp instruction selection patterns if (IsStrict) { - DAG.mutateStrictFPToFP(Op.getNode()); + SDLoc Loc(Op); + SDValue Result = + DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT + : ISD::FP_TO_UINT, + Loc, Op.getValueType(), SrcVal); + return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); } return Op; @@ -5696,85 +5956,27 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. -static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { +SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); - // This function is only supposed to be called for i64 types, either as the - // source or destination of the bit convert. + // This function is only supposed to be called for i16 and i64 types, either + // as the source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); - const bool HasFullFP16 = Subtarget->hasFullFP16(); - - if (SrcVT == MVT::f32 && DstVT == MVT::i32) { - // FullFP16: half values are passed in S-registers, and we don't - // need any of the bitcast and moves: - // - // t2: f32,ch = CopyFromReg t0, Register:f32 %0 - // t5: i32 = bitcast t2 - // t18: f16 = ARMISD::VMOVhr t5 - if (Op.getOpcode() != ISD::CopyFromReg || - Op.getValueType() != MVT::f32) - return SDValue(); - - auto Move = N->use_begin(); - if (Move->getOpcode() != ARMISD::VMOVhr) - return SDValue(); - - SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; - SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); - DAG.ReplaceAllUsesWith(*Move, &Copy); - return Copy; - } - - if (SrcVT == MVT::i16 && DstVT == MVT::f16) { - if (!HasFullFP16) - return SDValue(); - // SoftFP: read half-precision arguments: - // - // t2: i32,ch = ... 
- // t7: i16 = truncate t2 <~~~~ Op - // t8: f16 = bitcast t7 <~~~~ N - // - if (Op.getOperand(0).getValueType() == MVT::i32) - return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), - MVT::f16, Op.getOperand(0)); - return SDValue(); - } + if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && + (DstVT == MVT::f16 || DstVT == MVT::bf16)) + return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), + DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); - // Half-precision return values - if (SrcVT == MVT::f16 && DstVT == MVT::i16) { - if (!HasFullFP16) - return SDValue(); - // - // t11: f16 = fadd t8, t10 - // t12: i16 = bitcast t11 <~~~ SDNode N - // t13: i32 = zero_extend t12 - // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 - // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 - // - // transform this into: - // - // t20: i32 = ARMISD::VMOVrh t11 - // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 - // - auto ZeroExtend = N->use_begin(); - if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || - ZeroExtend->getValueType(0) != MVT::i32) - return SDValue(); - - auto Copy = ZeroExtend->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg && - Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { - SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); - DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); - return Cvt; - } - return SDValue(); - } + if ((DstVT == MVT::i16 || DstVT == MVT::i32) && + (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) + return DAG.getNode( + ISD::TRUNCATE, SDLoc(N), DstVT, + MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); @@ -5917,16 +6119,20 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue Ops[] = { DAG.getEntryNode(), - DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; + SDValue Chain = Op.getOperand(0); + SDValue Ops[] = {Chain, + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; - SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); + SDValue FPSCR = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); + Chain = FPSCR.getValue(1); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); + SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); + return DAG.getMergeValues({And, Chain}, dl); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, @@ -6411,9 +6617,10 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { /// immediate" operand (e.g., VMOV). If so, return the encoded value. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - const SDLoc &dl, EVT &VT, bool is128Bits, + const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type) { unsigned OpCmode, Imm; + bool is128Bits = VectorVT.is128BitVector(); // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. 
However, NEON modified @@ -6531,9 +6738,18 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getDataLayout().isBigEndian()) - // swap higher and lower 32 bit word - Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); + if (DAG.getDataLayout().isBigEndian()) { + // Reverse the order of elements within the vector. + unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; + unsigned Mask = (1 << BytesPerElem) - 1; + unsigned NumElems = 8 / BytesPerElem; + unsigned NewImm = 0; + for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { + unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); + NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; + } + Imm = NewImm; + } // Op=1, Cmode=1110. OpCmode = 0x1e; @@ -6572,8 +6788,6 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); - if (!ST->isLittle()) - std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: @@ -6626,7 +6840,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), - VMovVT, false, VMOVModImm); + VMovVT, VT, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, @@ -6643,7 +6857,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Finally, try a VMVN.i32 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, - false, VMVNModImm); + VT, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); @@ -7051,6 +7265,104 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { return true; } +// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted +// from a pair of inputs. For example: +// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), +// FP_ROUND(EXTRACT_ELT(Y, 0), +// FP_ROUND(EXTRACT_ELT(X, 1), +// FP_ROUND(EXTRACT_ELT(Y, 1), ...) +static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, + const ARMSubtarget *ST) { + assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + if (!ST->hasMVEFloatOps()) + return SDValue(); + + SDLoc dl(BV); + EVT VT = BV.getValueType(); + if (VT != MVT::v8f16) + return SDValue(); + + // We are looking for a buildvector of fptrunc elements, where all the + // elements are interleavingly extracted from two sources. Check the first two + // items are valid enough and extract some info from them (they are checked + // properly in the loop below). 
+ if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || + BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || + BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) + return SDValue(); + if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || + BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || + BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) + return SDValue(); + SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); + SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); + if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) + return SDValue(); + + // Check all the values in the BuildVector line up with our expectations. + for (unsigned i = 1; i < 4; i++) { + auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { + return Trunc.getOpcode() == ISD::FP_ROUND && + Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Trunc.getOperand(0).getOperand(0) == Op && + Trunc.getOperand(0).getConstantOperandVal(1) == Idx; + }; + if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) + return SDValue(); + if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) + return SDValue(); + } + + SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, + DAG.getConstant(1, dl, MVT::i32)); +} + +// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted +// from a single input on alternating lanes. For example: +// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), +// FP_ROUND(EXTRACT_ELT(X, 2), +// FP_ROUND(EXTRACT_ELT(X, 4), ...) +static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, + const ARMSubtarget *ST) { + assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + if (!ST->hasMVEFloatOps()) + return SDValue(); + + SDLoc dl(BV); + EVT VT = BV.getValueType(); + if (VT != MVT::v4f32) + return SDValue(); + + // We are looking for a buildvector of fptext elements, where all the + // elements are alternating lanes from a single source. For example <0,2,4,6> + // or <1,3,5,7>. Check the first two items are valid enough and extract some + // info from them (they are checked properly in the loop below). + if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || + BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); + int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); + if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) + return SDValue(); + + // Check all the values in the BuildVector line up with our expectations. + for (unsigned i = 1; i < 4; i++) { + auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { + return Trunc.getOpcode() == ISD::FP_EXTEND && + Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Trunc.getOperand(0).getOperand(0) == Op && + Trunc.getOperand(0).getConstantOperandVal(1) == Idx; + }; + if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) + return SDValue(); + } + + return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, + DAG.getConstant(Offset, dl, MVT::i32)); +} + // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
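To make the pattern matched by LowerBuildVectorOfFPTrunc above more concrete: every even BUILD_VECTOR element must be lane i of source X and every odd element lane i of source Y, which is what allows the node to be rebuilt as a pair of MVE VCVTN nodes (selected, as far as the surrounding code indicates, to the bottom/top f32-to-f16 converts). A standalone C++ sketch of just that index check follows; it is not LLVM code, and the struct and function names are hypothetical.

#include <array>
#include <cassert>

struct Elt {
  int Source;     // which of the two v4f32 inputs the element comes from
  unsigned Lane;  // which lane of that input is extracted
};

// Elements must interleave as X[0], Y[0], X[1], Y[1], X[2], Y[2], X[3], Y[3].
static bool isInterleavedFPTruncPattern(const std::array<Elt, 8> &B) {
  for (unsigned i = 0; i < 4; ++i) {
    if (B[2 * i + 0].Source != 0 || B[2 * i + 0].Lane != i)
      return false;
    if (B[2 * i + 1].Source != 1 || B[2 * i + 1].Lane != i)
      return false;
  }
  return true;
}

int main() {
  std::array<Elt, 8> Good = {{{0,0},{1,0},{0,1},{1,1},{0,2},{1,2},{0,3},{1,3}}};
  std::array<Elt, 8> Bad  = {{{0,0},{0,1},{0,2},{0,3},{1,0},{1,1},{1,2},{1,3}}};
  assert(isInterleavedFPTruncPattern(Good)); // matches the VCVTN-friendly layout
  assert(!isInterleavedFPTruncPattern(Bad)); // a plain concatenation does not
  return 0;
}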
@@ -7150,13 +7462,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getUNDEF(VT); if ((ST->hasNEON() && SplatBitSize <= 64) || - (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { + (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - VMOVModImm); + SDValue Val = + isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); @@ -7166,9 +7477,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isVMOVModifiedImm( - NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, + VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -7308,12 +7618,19 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); - // Empirical tests suggest this is rarely worth it for vectors of length <= 2. - if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and + // vmovn). Empirical tests suggest this is rarely worth it for vectors of + // length <= 2. + if (NumElts >= 4) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; - } + + // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into + // VCVT's + if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) + return VCVT; + if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) + return VCVT; if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector @@ -7514,7 +7831,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); - Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } @@ -7566,7 +7883,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[1], Mask, DAG); if (!Shuffle) return SDValue(); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); } enum ShuffleOpCodes { @@ -8879,7 +9196,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { if (ShouldUseSRet) { // Create stack object for sret. 
const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + const Align StackAlign = DL.getPrefTypeAlign(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); @@ -9054,8 +9371,7 @@ void ARMTargetLowering::ExpandDIV_Windows( DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); - Results.push_back(Lower); - Results.push_back(Upper); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); } static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { @@ -9100,8 +9416,9 @@ void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SDValue Result = DAG.getMemIntrinsicNode( ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, - Result.getValue(0), Result.getValue(1)); + SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); + SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); Results.append({Pair, Result.getValue(2)}); } } @@ -9146,10 +9463,14 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, SDNode *N = Op.getNode(); SDLoc dl(N); - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(0, dl, MVT::i32)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(1, dl, MVT::i32)); + SDValue Lo = DAG.getNode( + ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, + MVT::i32)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 
1 : 0, dl, + MVT::i32)); return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), {ST->getChain(), Lo, Hi, ST->getBasePtr()}, @@ -9188,13 +9509,87 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; - if (!PassThru.isUndef() && - (PassThru.getOpcode() != ISD::BITCAST || - !isZeroVector(PassThru->getOperand(0)))) + bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || + PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && + isZeroVector(PassThru->getOperand(0)); + if (!PassThru.isUndef() && !PassThruIsCastZero) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } +static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEIntegerOps()) + return SDValue(); + + SDLoc dl(Op); + unsigned BaseOpcode = 0; + switch (Op->getOpcode()) { + default: llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; + case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; + case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; + case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; + case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; + } + + SDValue Op0 = Op->getOperand(0); + EVT VT = Op0.getValueType(); + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumActiveLanes = NumElts; + + assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || + NumActiveLanes == 2) && + "Only expected a power 2 vector size"); + + // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements + // allows us to easily extract vector elements from the lanes. + while (NumActiveLanes > 4) { + unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32; + SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); + Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); + NumActiveLanes /= 2; + } + + SDValue Res; + if (NumActiveLanes == 4) { + // The remaining 4 elements are summed sequentially + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); + SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); + SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); + } else { + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1, dl, MVT::i32)); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + } + + // Result type may be wider than element type. 
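An aside on the reduction strategy used by LowerVecReduce above: the vector is repeatedly combined with a lane-reversed copy of itself (VREV16, then VREV32) until four active lanes remain, and those four are finished off with scalar extracts. The following is a standalone C++ model of that shape for VECREDUCE_MUL; the names (rev_within_groups, vecreduce_mul_model) are hypothetical, and the group size is tracked in element counts rather than by the fixed VREV widths of the real code.

  #include <array>
  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  // Lane reversal within groups of `group` elements (stands in for VREVn).
  template <std::size_t N>
  std::array<int32_t, N> rev_within_groups(const std::array<int32_t, N> &v,
                                           std::size_t group) {
    std::array<int32_t, N> r{};
    for (std::size_t i = 0; i < N; ++i)
      r[i] = v[(i / group) * group + (group - 1 - i % group)];
    return r;
  }

  // Combine X with rev(X) until four active lanes remain, then reduce those
  // four as scalars, mirroring the loop in LowerVecReduce.
  template <std::size_t N>
  int32_t vecreduce_mul_model(std::array<int32_t, N> v) {
    std::size_t active = N, group = 2;
    while (active > 4) {
      auto rev = rev_within_groups(v, group);
      for (std::size_t i = 0; i < N; ++i)
        v[i] *= rev[i];             // BaseOpcode (here ISD::MUL), lane-wise
      active /= 2;
      group *= 2;
    }
    // The four partial results sit in lanes 0, N/4, 2N/4 and 3N/4.
    return v[0] * v[1 * N / 4] * v[2 * N / 4] * v[3 * N / 4];
  }

  int main() {
    assert(vecreduce_mul_model(std::array<int32_t, 8>{1, 2, 3, 4, 5, 6, 7, 8}) ==
           40320); // 8!
    assert(vecreduce_mul_model(std::array<int32_t, 16>{2, 2, 2, 2, 2, 2, 2, 2,
                                                       2, 2, 2, 2, 2, 2, 2, 2}) ==
           65536); // 2^16
    return 0;
  }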
+ if (EltVT != Op->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); + return Res; +} + +static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEFloatOps()) + return SDValue(); + return LowerVecReduce(Op, DAG, ST); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -9264,15 +9659,61 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, bool isBigEndian = DAG.getDataLayout().isBigEndian(); - Results.push_back( + SDValue Lo = DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); - Results.push_back( + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); + SDValue Hi = DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 2)); } +SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDValue Chain = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + + // If we don't have instructions of this float type then soften to a libcall + // and use SETCC instead. + if (isUnsupportedFloatingType(LHS.getValueType())) { + DAG.getTargetLoweringInfo().softenSetCCOperands( + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, dl, LHS.getValueType()); + CC = ISD::SETNE; + } + SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, + DAG.getCondCode(CC)); + return DAG.getMergeValues({Result, Chain}, dl); + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit + // in CMPFP and CMPFPE, but instead it should be made explicit by these + // instructions using a chain instead of glue. This would also fix the problem + // here (and also in LowerSELECT_CC) where we generate two comparisons when + // CondCode2 != AL. 
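When FPCCToARMCC has to split a comparison into two ARM condition codes (CondCode2 != AL), the lowering just below chains a second conditional move onto the first, so the result is 1 whenever either predicate holds. A minimal C++ sketch of that select chain; lower_fsetcc_model is a hypothetical name and the two booleans stand in for the ARM flags, not the real FPCCToARMCC mapping.

  #include <cassert>

  // Start from 0 (False), conditionally move 1 (True) in for the first
  // predicate, then do the same for the optional second one.  The net
  // effect is cond1 || cond2.
  static int lower_fsetcc_model(bool cond1, bool cond2, bool hasCondCode2) {
    int result = cond1 ? 1 : 0;      // getCMOV(False, True, ARMcc)
    if (hasCondCode2)
      result = cond2 ? 1 : result;   // second getCMOV feeding the first
    return result;
  }

  int main() {
    assert(lower_fsetcc_model(false, true, true) == 1);
    assert(lower_fsetcc_model(false, false, true) == 0);
    assert(lower_fsetcc_model(true, false, false) == 1);
    return 0;
  }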
+ SDValue True = DAG.getConstant(1, dl, VT); + SDValue False = DAG.getConstant(0, dl, VT); + SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); + SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); + if (CondCode2 != ARMCC::AL) { + ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); + Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); + Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); + } + return DAG.getMergeValues({Result, Chain}, dl); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -9353,6 +9794,16 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTORE(Op, DAG, Subtarget); case ISD::MLOAD: return LowerMLOAD(Op, DAG); + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return LowerVecReduce(Op, DAG, Subtarget); + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + return LowerVecReduceF(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -9366,6 +9817,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } @@ -9397,8 +9850,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, DAG.getVTList(MVT::i32, MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); - Results.push_back(LongMul.getValue(0)); - Results.push_back(LongMul.getValue(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, + LongMul.getValue(0), LongMul.getValue(1))); } /// ReplaceNodeResults - Replace the results of node with an illegal result @@ -9487,7 +9940,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); - unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); + unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; @@ -9495,11 +9948,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOStore, 4, 4); + MachineMemOperand::MOStore, 4, Align(4)); // Load the address of the dispatch MBB into the jump buffer. 
if (isThumb2) { @@ -9685,7 +10138,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -9776,10 +10229,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); - if (Align == 0) - Align = MF->getDataLayout().getTypeAllocSize(C->getType()); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) @@ -9816,8 +10267,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg3) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( - MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), + MachineMemOperand::MOLoad, 4, Align(4)); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -9877,10 +10329,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); - if (Align == 0) - Align = MF->getDataLayout().getTypeAllocSize(C->getType()); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) @@ -9910,8 +10360,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( - MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), + MachineMemOperand::MOLoad, 4, Align(4)); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) @@ -10150,7 +10601,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, Register dest = MI.getOperand(0).getReg(); Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); - unsigned Align = MI.getOperand(3).getImm(); + unsigned Alignment = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); @@ -10163,17 +10614,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); - if (Align & 1) { + if (Alignment & 1) { UnitSize = 1; - } else if (Align & 2) { + } else if (Alignment & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. 
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Align % 16 == 0) && SizeVal >= 16) + if ((Alignment % 16 == 0) && SizeVal >= 16) UnitSize = 16; - else if ((Align % 8 == 0) && SizeVal >= 8) + else if ((Alignment % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. @@ -10279,13 +10730,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); - if (Align == 0) - Align = MF->getDataLayout().getTypeAllocSize(C->getType()); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) @@ -11655,6 +12104,42 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } +static SDValue PerformVSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). + // + // We need to re-implement this optimization here as the implementation in the + // Target-Independent DAGCombiner does not handle the kind of constant we make + // (it calls isConstOrConstSplat with AllowTruncation set to false - and for + // good reason, allowing truncation there would break other targets). + // + // Currently, this is only done for MVE, as it's the only target that benefits + // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::XOR) + return SDValue(); + SDValue XOR = N->getOperand(0); + + // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. + // It is important to check with truncation allowed as the BUILD_VECTORs we + // generate in those situations will truncate their operands. + ConstantSDNode *Const = + isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, + /*AllowTruncation*/ true); + if (!Const || !Const->isOne()) + return SDValue(); + + // Rewrite into vselect(cond, rhs, lhs). + SDValue Cond = XOR->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + EVT Type = N->getValueType(0); + return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); +} + static SDValue PerformABSCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -11712,6 +12197,71 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } +static SDValue PerformADDVecReduce(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this + // will look like: + // t1: i32,i32 = ARMISD::VADDLVs x + // t2: i64 = build_pair t1, t1:1 + // t3: i64 = add t2, y + // We also need to check for sext / zext and commutitive adds. 
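The rewrite implemented below feeds the other i64 addend into the accumulating VADDLVA/VMLALVA forms as a lo/hi pair of i32s. A small arithmetic sketch (plain C++ with hypothetical names, not the SelectionDAG code) of why folding the add into the accumulator is sound:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // vaddlv: widening sum of 32-bit lanes into a 64-bit result.
  static int64_t vaddlv_model(const std::vector<int32_t> &v) {
    int64_t sum = 0;
    for (int32_t lane : v)
      sum += lane;
    return sum;
  }

  // vaddlva: the same reduction started from an accumulator passed as lo/hi,
  // which is exactly how the i64 addend is handed to the new node.
  static int64_t vaddlva_model(uint32_t lo, uint32_t hi,
                               const std::vector<int32_t> &v) {
    int64_t acc = (int64_t)(((uint64_t)hi << 32) | lo);
    return acc + vaddlv_model(v);
  }

  int main() {
    std::vector<int32_t> v{100, -7, 42, 9};
    int64_t y = 0x123456789abcLL;
    // add(build_pair(VADDLV(v)), y) == VADDLVA(lo(y), hi(y), v)
    assert(vaddlv_model(v) + y ==
           vaddlva_model((uint32_t)y, (uint32_t)(y >> 32), v));
    return 0;
  }

The masked (VADDLVp*) and multiply-accumulate (VMLALV*) variants handled below follow the same shape, only the reduction being accumulated changes.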
+ auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, + SDValue NB) { + if (NB->getOpcode() != ISD::BUILD_PAIR) + return SDValue(); + SDValue VecRed = NB->getOperand(0); + if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 || + NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) + return SDValue(); + + SDLoc dl(N); + SmallVector<SDValue, 4> Ops; + Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, + DCI.DAG.getConstant(0, dl, MVT::i32))); + Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, + DCI.DAG.getConstant(1, dl, MVT::i32))); + for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++) + Ops.push_back(VecRed->getOperand(i)); + SDValue Red = DCI.DAG.getNode(OpcodeA, dl, + DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops); + return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, + SDValue(Red.getNode(), 1)); + }; + + if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) + return M; + return SDValue(); +} + bool ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { @@ -11883,6 +12433,9 @@ static SDValue PerformADDCombine(SDNode *N, if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; + if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget)) + return Result; + // First try with the default operand order. if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) return Result; @@ -11974,18 +12527,86 @@ static SDValue PerformVMULCombine(SDNode *N, DAG.getNode(ISD::MUL, DL, VT, N01, N1)); } +static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT != MVT::v2i64) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + auto IsSignExt = [&](SDValue Op) { + if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) + return SDValue(); + EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); + if (VT.getScalarSizeInBits() == 32) + return Op->getOperand(0); + return SDValue(); + }; + auto IsZeroExt = [&](SDValue Op) { + // Zero extends are a little more awkward. At the point we are matching + // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. + // That might be before of after a bitcast depending on how the and is + // placed. Because this has to look through bitcasts, it is currently only + // supported on LE. 
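A standalone check (plain C++, little-endian host assumed) of the equivalence the IsZeroExt matcher relies on: AND-ing each 64-bit lane with the v4i32 splat pattern (-1, 0, -1, 0) keeps exactly the low 32 bits of that lane, i.e. it zero-extends the even 32-bit lanes. This is also why the match is restricted to little endian.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    // Host-endianness probe; the identity below only holds for a
    // little-endian lane layout, matching the LE-only restriction above.
    uint16_t probe = 1;
    uint8_t low_byte;
    std::memcpy(&low_byte, &probe, 1);
    if (low_byte != 1)
      return 0; // big-endian host: the bitcast trick does not apply

    uint32_t lanes32[4] = {0xDEADBEEFu, 0x11111111u, 0xCAFEF00Du, 0x22222222u};
    uint32_t mask32[4] = {0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u}; // (-1, 0, -1, 0)

    uint64_t lanes64[2], mask64[2];
    std::memcpy(lanes64, lanes32, sizeof(lanes64)); // the bitcast to v2i64
    std::memcpy(mask64, mask32, sizeof(mask64));

    for (int i = 0; i < 2; ++i)
      assert((lanes64[i] & mask64[i]) == (uint64_t)lanes32[2 * i]);
    return 0;
  }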
+ if (!Subtarget->isLittle()) + return SDValue(); + + SDValue And = Op; + if (And->getOpcode() == ISD::BITCAST) + And = And->getOperand(0); + if (And->getOpcode() != ISD::AND) + return SDValue(); + SDValue Mask = And->getOperand(1); + if (Mask->getOpcode() == ISD::BITCAST) + Mask = Mask->getOperand(0); + + if (Mask->getOpcode() != ISD::BUILD_VECTOR || + Mask.getValueType() != MVT::v4i32) + return SDValue(); + if (isAllOnesConstant(Mask->getOperand(0)) && + isNullConstant(Mask->getOperand(1)) && + isAllOnesConstant(Mask->getOperand(2)) && + isNullConstant(Mask->getOperand(3))) + return And->getOperand(0); + return SDValue(); + }; + + SDLoc dl(N); + if (SDValue Op0 = IsSignExt(N0)) { + if (SDValue Op1 = IsSignExt(N1)) { + SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); + SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); + return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); + } + } + if (SDValue Op0 = IsZeroExt(N0)) { + if (SDValue Op1 = IsZeroExt(N1)) { + SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); + SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); + return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); + } + } + + return SDValue(); +} + static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) + return PerformMVEVMULLCombine(N, DAG, Subtarget); + if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) @@ -12170,20 +12791,21 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || + VT == MVT::v8i1 || VT == MVT::v16i1) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { + if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || + SplatBitSize == 64) { EVT VbicVT; SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VbicVT, VT.is128BitVector(), - OtherModImm); + DAG, dl, VbicVT, VT, OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); @@ -12413,58 +13035,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) { }; } +static ARMCC::CondCodes getVCMPCondCode(SDValue N) { + if (N->getOpcode() == ARMISD::VCMP) + return (ARMCC::CondCodes)N->getConstantOperandVal(2); + else if (N->getOpcode() == ARMISD::VCMPZ) + return (ARMCC::CondCodes)N->getConstantOperandVal(1); + else + llvm_unreachable("Not a VCMP/VCMPZ!"); +} + +static bool CanInvertMVEVCMP(SDValue N) { + ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); + return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); +} + static SDValue PerformORCombine_i1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Try to invert "or A, B" -> 
"and ~A, ~B", as the "and" is easier to chain // together with predicates EVT VT = N->getValueType(0); + SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ARMCC::CondCodes CondCode0 = ARMCC::AL; - ARMCC::CondCodes CondCode1 = ARMCC::AL; - if (N0->getOpcode() == ARMISD::VCMP) - CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) - ->getZExtValue(); - else if (N0->getOpcode() == ARMISD::VCMPZ) - CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) - ->getZExtValue(); - if (N1->getOpcode() == ARMISD::VCMP) - CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) - ->getZExtValue(); - else if (N1->getOpcode() == ARMISD::VCMPZ) - CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) - ->getZExtValue(); - - if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) - return SDValue(); - - unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); - unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + auto IsFreelyInvertable = [&](SDValue V) { + if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) + return CanInvertMVEVCMP(V); + return false; + }; - if (!isValidMVECond(Opposite0, - N0->getOperand(0)->getValueType(0).isFloatingPoint()) || - !isValidMVECond(Opposite1, - N1->getOperand(0)->getValueType(0).isFloatingPoint())) + // At least one operand must be freely invertable. + if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) return SDValue(); - SmallVector<SDValue, 4> Ops0; - Ops0.push_back(N0->getOperand(0)); - if (N0->getOpcode() == ARMISD::VCMP) - Ops0.push_back(N0->getOperand(1)); - Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); - SmallVector<SDValue, 4> Ops1; - Ops1.push_back(N1->getOperand(0)); - if (N1->getOpcode() == ARMISD::VCMP) - Ops1.push_back(N1->getOperand(1)); - Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); - - SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); - SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); - SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); - return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, - DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); + SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT); + SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT); + SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); + return DCI.DAG.getLogicalNOT(DL, And, VT); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR @@ -12480,17 +13088,21 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { + if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || + SplatBitSize == 64) { EVT VorrVT; - SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VorrVT, VT.is128BitVector(), - OtherModImm); + SDValue Val = + isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); if (Val.getNode()) { SDValue Input 
= DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); @@ -12551,10 +13163,6 @@ static SDValue PerformORCombine(SDNode *N, } } - if (Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) - return PerformORCombine_i1(N, DCI, Subtarget); - // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -12586,6 +13194,27 @@ static SDValue PerformXORCombine(SDNode *N, return Result; } + if (Subtarget->hasMVEIntegerOps()) { + // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + const TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->isConstTrueVal(N1.getNode()) && + (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { + if (CanInvertMVEVCMP(N0)) { + SDLoc DL(N0); + ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); + + SmallVector<SDValue, 4> Ops; + Ops.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops.push_back(N0->getOperand(1)); + Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32)); + return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); + } + } + } + return SDValue(); } @@ -12784,6 +13413,78 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + + // VMOVhr (VMOVrh (X)) -> X + if (Op0->getOpcode() == ARMISD::VMOVrh) + return Op0->getOperand(0); + + // FullFP16: half values are passed in S-registers, and we don't + // need any of the bitcast and moves: + // + // t2: f32,ch = CopyFromReg t0, Register:f32 %0 + // t5: i32 = bitcast t2 + // t18: f16 = ARMISD::VMOVhr t5 + if (Op0->getOpcode() == ISD::BITCAST) { + SDValue Copy = Op0->getOperand(0); + if (Copy.getValueType() == MVT::f32 && + Copy->getOpcode() == ISD::CopyFromReg) { + SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; + SDValue NewCopy = + DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); + return NewCopy; + } + } + + // fold (VMOVhr (load x)) -> (load (f16*)x) + if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { + if (LN0->hasOneUse() && LN0->isUnindexed() && + LN0->getMemoryVT() == MVT::i16) { + SDValue Load = + DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), + LN0->getBasePtr(), LN0->getMemOperand()); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); + DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); + return Load; + } + } + + // Only the bottom 16 bits of the source register are used. 
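The rewritten PerformORCombine_i1 above and the new XOR fold both lean on the fact that MVE predicates are plain lane masks: De Morgan's law holds bit-for-bit, a NOT is free whenever the operand is a VCMP whose condition can be flipped (CanInvertMVEVCMP), and the xor-with-all-ones produced for the remaining NOT is what PerformXORCombine folds back into an inverted VCMP. A quick standalone check of the identity over a few sample masks (plain C++):

  #include <cassert>
  #include <cstdint>

  int main() {
    // or(A, B) == not(and(not(A), not(B))), bit-for-bit on 16-lane masks.
    const uint16_t samples[] = {0x0000, 0xFFFF, 0x00FF, 0xF0F0, 0x1234, 0xAAAA};
    for (uint16_t a : samples)
      for (uint16_t b : samples)
        assert((uint16_t)(a | b) ==
               (uint16_t)~((uint16_t)~a & (uint16_t)~b));
    return 0;
  }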
+ APInt DemandedMask = APInt::getLowBitsSet(32, 16); + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + +static SDValue PerformVMOVrhCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (VMOVrh (load x)) -> (zextload (i16*)x) + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + + SDValue Load = + DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), + LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); + DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); + return Load; + } + + // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) + if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N0->getOperand(1))) + return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), + N0->getOperand(1)); + + return SDValue(); +} + /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -12934,8 +13635,29 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // If the valuetypes are the same, we can remove the cast entirely. if (Op->getOperand(0).getValueType() == VT) return Op->getOperand(0); - return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, - Op->getOperand(0).getValueType(), Op->getOperand(0)); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); + } + + return SDValue(); +} + +static SDValue +PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + SDLoc dl(N); + + // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST + if (ST->isLittle()) + return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) + if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { + // If the valuetypes are the same, we can remove the cast entirely. + if (Op->getOperand(0).getValueType() == VT) + return Op->getOperand(0); + return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); } return SDValue(); @@ -13000,6 +13722,29 @@ static SDValue PerformInsertEltCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } +static SDValue PerformExtractEltCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc dl(N); + + // extract (vdup x) -> x + if (Op0->getOpcode() == ARMISD::VDUP) { + SDValue X = Op0->getOperand(0); + if (VT == MVT::f16 && X.getValueType() == MVT::i32) + return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); + if (VT == MVT::i32 && X.getValueType() == MVT::f16) + return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); + + while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) + X = X->getOperand(0); + if (X.getValueType() == VT) + return X; + } + + return SDValue(); +} + /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. 
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { @@ -13281,6 +14026,128 @@ static SDValue PerformVLDCombine(SDNode *N, return CombineBaseUpdate(N, DCI); } +static SDValue PerformMVEVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue Addr = N->getOperand(2); + MemSDNode *MemN = cast<MemSDNode>(N); + SDLoc dl(N); + + // For the stores, where there are multiple intrinsics we only actually want + // to post-inc the last of the them. + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_mve_vst2q && + cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) + return SDValue(); + if (IntNo == Intrinsic::arm_mve_vst4q && + cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) + return SDValue(); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. We can avoid searching through Addr as it's a + // predecessor to both. + SmallPtrSet<const SDNode *, 32> Visited; + SmallVector<const SDNode *, 16> Worklist; + Visited.insert(Addr.getNode()); + Worklist.push_back(N); + Worklist.push_back(User); + if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || + SDNode::hasPredecessorHelper(User, Visited, Worklist)) + continue; + + // Find the new opcode for the updating load/store. + bool isLoadOp = true; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + switch (IntNo) { + default: + llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); + case Intrinsic::arm_mve_vld2q: + NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; + break; + case Intrinsic::arm_mve_vld4q: + NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; + break; + case Intrinsic::arm_mve_vst2q: + NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; + isLoadOp = false; + break; + case Intrinsic::arm_mve_vst4q: + NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; + isLoadOp = false; + break; + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoadOp) { + VecTy = N->getValueType(0); + } else { + VecTy = N->getOperand(3).getValueType(); + } + + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); + if (!CInc || CInc->getZExtValue() != NumBytes) + continue; + + // Create the new updating load/store node. + // First, create an SDVTList for the new updating node's results. + EVT Tys[6]; + unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i32; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); + + // Then, gather the new node's operands. 
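The constant-increment check in the MVE VLDn combine above is just a byte count: the ADD on the address folds into a post-increment form only when it advances the pointer by exactly the memory the whole VLD2/VLD4 or VST2/VST4 touches. A tiny sketch of that arithmetic (hypothetical helper name):

  #include <cassert>

  // NumBytes = NumVecs * VecTy.getSizeInBits() / 8
  static unsigned mve_vldn_bytes(unsigned NumVecs, unsigned VecSizeInBits) {
    return NumVecs * VecSizeInBits / 8;
  }

  int main() {
    assert(mve_vldn_bytes(2, 128) == 32); // vld2q of q-regs: only "add #32" folds
    assert(mve_vldn_bytes(4, 128) == 64); // vld4q of q-regs: only "add #64" folds
    // Any other constant increment is left as a separate ADD.
    return 0;
  }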
+ SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(2)); // ptr + Ops.push_back(Inc); + + for (unsigned i = 3; i < N->getNumOperands(); ++i) + Ops.push_back(N->getOperand(i)); + + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, + MemN->getMemOperand()); + + // Update the uses. + SmallVector<SDValue, 5> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) + NewResults.push_back(SDValue(UpdN.getNode(), i)); + + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + + return SDValue(); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -13365,8 +14232,21 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SDValue Op = N->getOperand(0); + EVT VT = N->getValueType(0); + + // On MVE, we just convert the VDUPLANE to a VDUP with an extract. + if (Subtarget->hasMVEIntegerOps()) { + EVT ExtractVT = VT.getVectorElementType(); + // We need to ensure we are creating a legal type. + if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) + ExtractVT = MVT::i32; + SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, + N->getOperand(0), N->getOperand(1)); + return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); + } // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. @@ -13387,7 +14267,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N, unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; - EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); @@ -13400,6 +14279,18 @@ static SDValue PerformVDUPCombine(SDNode *N, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + SDLoc dl(N); + + if (Subtarget->hasMVEIntegerOps()) { + // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will + // need to come from a GPR. 
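A small standalone check of why the f32 case of the VDUP rewrite described just above is value-preserving: duplicating the raw 32-bit pattern out of a GPR and then viewing the q-register as v4f32 reproduces the original float in every lane (plain C++, with memcpy standing in for the BITCAST):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    float f = 3.5f;
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // BITCAST f32 -> i32 (the GPR value)

    uint32_t lanes[4] = {bits, bits, bits, bits}; // VDUP.32 from the GPR
    float back[4];
    std::memcpy(back, lanes, sizeof(back));       // view the lanes as v4f32
    for (int i = 0; i < 4; ++i)
      assert(back[i] == 3.5f);
    return 0;
  }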
+ if (Op.getValueType() == MVT::f32) + return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); + else if (Op.getValueType() == MVT::f16) + return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), + DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); + } if (!Subtarget->hasNEON()) return SDValue(); @@ -13528,7 +14419,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); - if (Trunc->getOpcode() != ISD::TRUNCATE) + if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND) return SDValue(); EVT FromVT = Trunc->getOperand(0).getValueType(); EVT ToVT = Trunc.getValueType(); @@ -13543,20 +14434,54 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, NumElements = 4; if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) NumElements = 8; - if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16) + NumElements = 4; + if (NumElements == 0 || + (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) || FromVT.getVectorNumElements() % NumElements != 0) return SDValue(); + // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so + // use the VMOVN over splitting the store. We are looking for patterns of: + // !rev: 0 N 1 N+1 2 N+2 ... + // rev: N 0 N+1 1 N+2 2 ... + auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) { + unsigned NumElts = ToVT.getVectorNumElements(); + if (NumElts != M.size()) + return false; + + unsigned Off0 = rev ? NumElts : 0; + unsigned Off1 = rev ? 0 : NumElts; + + for (unsigned i = 0; i < NumElts; i += 2) { + if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) + return false; + if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) + return false; + } + + return true; + }; + + if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0))) + if (isVMOVNOriginalMask(Shuffle->getMask(), false) || + isVMOVNOriginalMask(Shuffle->getMask(), true)) + return SDValue(); + + LLVMContext &C = *DAG.getContext(); SDLoc DL(St); // Details about the old store SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - unsigned Alignment = St->getOriginalAlignment(); + Align Alignment = St->getOriginalAlign(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); - EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); - EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); + // We split the store into slices of NumElements. fp16 trunc stores are vcvt + // and then stored as truncating integer stores. 
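The shuffle-mask test introduced earlier in this hunk (isVMOVNOriginalMask) recognises the interleavings a VMOVN already produces, so such stores are left alone instead of being split. Below is a standalone copy of that predicate, with the element count passed explicitly instead of captured, plus a couple of example masks:

  #include <cassert>
  #include <vector>

  // A VMOVN-style mask interleaves the first halves of two vectors, either as
  // (0, N, 1, N+1, ...) or, reversed, as (N, 0, N+1, 1, ...).  Negative
  // entries (undef lanes) are accepted anywhere.
  static bool isVMOVNOriginalMask(const std::vector<int> &M, bool rev,
                                  unsigned NumElts) {
    if (M.size() != NumElts)
      return false;
    unsigned Off0 = rev ? NumElts : 0;
    unsigned Off1 = rev ? 0 : NumElts;
    for (unsigned i = 0; i < NumElts; i += 2) {
      if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
        return false;
      if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
        return false;
    }
    return true;
  }

  int main() {
    // v8i16 results interleaving the low halves of two v8i16 sources.
    assert(isVMOVNOriginalMask({0, 8, 1, 9, 2, 10, 3, 11}, false, 8));
    assert(isVMOVNOriginalMask({8, 0, 9, 1, 10, 2, 11, 3}, true, 8));
    assert(!isVMOVNOriginalMask({0, 1, 2, 3, 4, 5, 6, 7}, false, 8));
    return 0;
  }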
+ EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT( + C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); SmallVector<SDValue, 4> Stores; for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { @@ -13566,9 +14491,17 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), DAG.getConstant(i * NumElements, DL, MVT::i32)); + + if (ToEltVT == MVT::f16) { + SDValue FPTrunc = + DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), + Extract, DAG.getConstant(0, DL, MVT::i32)); + Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); + } + SDValue Store = DAG.getTruncStore( Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), - NewToVT, Alignment, MMOFlags, AAInfo); + NewToVT, Alignment.value(), MMOFlags, AAInfo); Stores.push_back(Store); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); @@ -13766,8 +14699,163 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, ConvInput, DAG.getConstant(C, dl, MVT::i32)); } +static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEIntegerOps()) + return SDValue(); + + assert(N->getOpcode() == ISD::VECREDUCE_ADD); + EVT ResVT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDLoc dl(N); + + // We are looking for something that will have illegal types if left alone, + // but that we can convert to a single instruction undef MVE. For example + // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A + // or + // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B + + // Cases: + // VADDV u/s 8/16/32 + // VMLAV u/s 8/16/32 + // VADDLV u/s 32 + // VMLALV u/s 16/32 + + auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { + if (ResVT != RetTy || N0->getOpcode() != ExtendCode) + return SDValue(); + SDValue A = N0->getOperand(0); + if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) + return A; + return SDValue(); + }; + auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, + SDValue &A, SDValue &B) { + if (ResVT != RetTy || N0->getOpcode() != ISD::MUL) + return false; + SDValue ExtA = N0->getOperand(0); + SDValue ExtB = N0->getOperand(1); + if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) + return false; + A = ExtA->getOperand(0); + B = ExtB->getOperand(0); + if (A.getValueType() == B.getValueType() && + llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) + return true; + return false; + }; + auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { + SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, + SDValue(Node.getNode(), 1)); + }; + + if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) + return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); + if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) + return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); + if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) + return Create64bitNode(ARMISD::VADDLVs, {A}); + if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) + return Create64bitNode(ARMISD::VADDLVu, {A}); + + SDValue A, B; + if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) + return 
DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); + if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) + return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); + if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) + return Create64bitNode(ARMISD::VMLALVs, {A, B}); + if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) + return Create64bitNode(ARMISD::VMLALVu, {A, B}); + return SDValue(); +} + +static SDValue PerformVMOVNCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + unsigned IsTop = N->getConstantOperandVal(2); + + // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) + // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) + if ((Op1->getOpcode() == ARMISD::VQMOVNs || + Op1->getOpcode() == ARMISD::VQMOVNu) && + Op1->getConstantOperandVal(2) == 0) + return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), + Op0, Op1->getOperand(1), N->getOperand(2)); + + // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from + // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting + // into the top or bottom lanes. + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); + APInt Op0DemandedElts = + IsTop ? Op1DemandedElts + : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + +static SDValue PerformVQMOVNCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + unsigned IsTop = N->getConstantOperandVal(2); + + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + APInt Op0DemandedElts = + APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) + : APInt::getHighBitsSet(2, 1)); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + return SDValue(); +} + +static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from + // uses of the intrinsics. + if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { + int ShiftAmt = C->getSExtValue(); + if (ShiftAmt == 0) { + SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); + DAG.ReplaceAllUsesWith(N, Merge.getNode()); + return SDValue(); + } + + if (ShiftAmt >= -32 && ShiftAmt < 0) { + unsigned NewOpcode = + N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; + SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, + DAG.getConstant(-ShiftAmt, DL, MVT::i32)); + DAG.ReplaceAllUsesWith(N, NewShift.getNode()); + return NewShift; + } + } + + return SDValue(); +} + /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { +SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: @@ -13916,6 +15004,72 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; + + case Intrinsic::arm_mve_vqdmlah: + case Intrinsic::arm_mve_vqdmlash: + case Intrinsic::arm_mve_vqrdmlah: + case Intrinsic::arm_mve_vqrdmlash: + case Intrinsic::arm_mve_vmla_n_predicated: + case Intrinsic::arm_mve_vmlas_n_predicated: + case Intrinsic::arm_mve_vqdmlah_predicated: + case Intrinsic::arm_mve_vqdmlash_predicated: + case Intrinsic::arm_mve_vqrdmlah_predicated: + case Intrinsic::arm_mve_vqrdmlash_predicated: { + // These intrinsics all take an i32 scalar operand which is narrowed to the + // size of a single lane of the vector type they return. So we don't need + // any bits of that operand above that point, which allows us to eliminate + // uxth/sxth. + unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); + if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) + return SDValue(); + break; + } + + case Intrinsic::arm_mve_minv: + case Intrinsic::arm_mve_maxv: + case Intrinsic::arm_mve_minav: + case Intrinsic::arm_mve_maxav: + case Intrinsic::arm_mve_minv_predicated: + case Intrinsic::arm_mve_maxv_predicated: + case Intrinsic::arm_mve_minav_predicated: + case Intrinsic::arm_mve_maxav_predicated: { + // These intrinsics all take an i32 scalar operand which is narrowed to the + // size of a single lane of the vector type they take as the other input. + unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); + if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) + return SDValue(); + break; + } + + case Intrinsic::arm_mve_addv: { + // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, + // which allow PerformADDVecReduce to turn it into VADDLV when possible. + bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); + } + + case Intrinsic::arm_mve_addlv: + case Intrinsic::arm_mve_addlv_predicated: { + // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR + // which recombines the two outputs into an i64 + bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? + (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : + (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps); + + SmallVector<SDValue, 4> Ops; + for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) + if (i != 2) // skip the unsigned flag + Ops.push_back(N->getOperand(i)); + + SDLoc dl(N); + SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), + val.getValue(1)); + } } return SDValue(); @@ -14011,9 +15165,10 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } -// Look for a sign/zero extend of a larger than legal load. This can be split -// into two extending loads, which are simpler to deal with than an arbitrary -// sign extend. 
+// Look for a sign/zero/fpextend extend of a larger than legal load. This can be +// split into multiple extending loads, which are simpler to deal with than an +// arbitrary extend. For fp extends we use an integer extending load and a VCVTL +// to convert the type to an f32. static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::LOAD) @@ -14035,45 +15190,63 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { NumElements = 4; if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) NumElements = 8; + if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) + NumElements = 4; if (NumElements == 0 || - FromVT.getVectorNumElements() == NumElements || + (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || FromVT.getVectorNumElements() % NumElements != 0 || !isPowerOf2_32(NumElements)) return SDValue(); + LLVMContext &C = *DAG.getContext(); SDLoc DL(LD); // Details about the old load SDValue Ch = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); - unsigned Alignment = LD->getOriginalAlignment(); + Align Alignment = LD->getOriginalAlign(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); ISD::LoadExtType NewExtType = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); - EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); - EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); - unsigned NewOffset = NewFromVT.getSizeInBits() / 8; - SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); - - // Split the load in half, each side of which is extended separately. This - // is good enough, as legalisation will take it from there. They are either - // already legal or they will be split further into something that is - // legal. - SDValue NewLoad1 = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, - LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); - SDValue NewLoad2 = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, - LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - SDValue(NewLoad1.getNode(), 1), - SDValue(NewLoad2.getNode(), 1)); + EVT NewFromVT = EVT::getVectorVT( + C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); + EVT NewToVT = EVT::getVectorVT( + C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); + + SmallVector<SDValue, 4> Loads; + SmallVector<SDValue, 4> Chains; + for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { + unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + SDValue NewLoad = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment.value(), MMOFlags, AAInfo); + Loads.push_back(NewLoad); + Chains.push_back(SDValue(NewLoad.getNode(), 1)); + } + + // Float truncs need to extended with VCVTB's into their floating point types. 
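The splitting performed above is driven purely by byte offsets: each slice gets its own extending load at NewOffset = i * sizeof(slice), and the results are concatenated (the new f16 to f32 path adds a bottom-lane VCVT per slice on top of the same slicing). A plain C++ model of the integer case:

  #include <cassert>
  #include <cstdint>

  // A v16i8 -> v16i16 zero-extending load is wider than any legal MVE load,
  // so it becomes two v8i8 -> v8i16 extending loads at byte offsets 0 and 8.
  int main() {
    uint8_t mem[16];
    for (int i = 0; i < 16; ++i)
      mem[i] = (uint8_t)(0xF0 + i); // high values, so zero-extension matters

    uint16_t whole[16], split[16];
    for (int i = 0; i < 16; ++i)
      whole[i] = mem[i]; // the illegal one-shot widening load

    for (int slice = 0; slice < 2; ++slice) {
      unsigned NewOffset = slice * 8; // bytes: i * sizeof(v8i8)
      for (int lane = 0; lane < 8; ++lane)
        split[slice * 8 + lane] = mem[NewOffset + lane]; // one v8i8 slice
    }

    for (int i = 0; i < 16; ++i)
      assert(whole[i] == split[i]);
    return 0;
  }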
+ if (FromEltVT == MVT::f16) { + SmallVector<SDValue, 4> Extends; + + for (unsigned i = 0; i < Loads.size(); i++) { + SDValue LoadBC = + DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); + SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, + DAG.getConstant(0, DL, MVT::i32)); + Extends.push_back(FPExt); + } + + Loads = Extends; + } + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); } /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, @@ -14121,6 +15294,116 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (ST->hasMVEFloatOps()) + if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) + return NewLoad; + + return SDValue(); +} + +/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating +/// saturates. +static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + if (!ST->hasMVEIntegerOps()) + return SDValue(); + + if (VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + + auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { + // Check one is a smin and the other is a smax + if (Min->getOpcode() != ISD::SMIN) + std::swap(Min, Max); + if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) + return false; + + APInt SaturateC; + if (VT == MVT::v4i32) + SaturateC = APInt(32, (1 << 15) - 1, true); + else //if (VT == MVT::v8i16) + SaturateC = APInt(16, (1 << 7) - 1, true); + + APInt MinC, MaxC; + if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || + MinC != SaturateC) + return false; + if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || + MaxC != ~SaturateC) + return false; + return true; + }; + + if (IsSignedSaturate(N, N0.getNode())) { + SDLoc DL(N); + MVT ExtVT, HalfVT; + if (VT == MVT::v4i32) { + HalfVT = MVT::v8i16; + ExtVT = MVT::v4i16; + } else { // if (VT == MVT::v8i16) + HalfVT = MVT::v16i8; + ExtVT = MVT::v8i8; + } + + // Create a VQMOVNB with undef top lanes, then signed extended into the top + // half. That extend will hopefully be removed if only the bottom bits are + // demanded (though a truncating store, for example). 
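The saturate patterns matched here only fire when the splat constants are exactly the bounds of the narrower type; in that case clamp-then-truncate is a lossless saturating narrow, which is what the VQMOVN being created computes. A standalone C++ check of that equivalence; narrow_ssat16 is a hypothetical name.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // Signed case for v4i32 -> i16: the smax/smin constants must be
  // ~0x7FFF (-32768) and 0x7FFF; for v8i16 -> i8 they are ~0x7F and 0x7F.
  static int16_t narrow_ssat16(int32_t x) {
    const int32_t SaturateC = (1 << 15) - 1;             // 0x7FFF
    int32_t clamped = std::min(std::max(x, ~SaturateC), SaturateC);
    return (int16_t)clamped;                             // now lossless
  }

  int main() {
    assert(narrow_ssat16(100000) == 32767);
    assert(narrow_ssat16(-100000) == -32768);
    assert(narrow_ssat16(-5) == -5);
    // The unsigned variant below is a single umin against 0xFFFF / 0xFF.
    assert((uint16_t)std::min<uint32_t>(70000u, 0xFFFFu) == 0xFFFFu);
    return 0;
  }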
+ SDValue VQMOVN = + DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), + N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); + SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, + DAG.getValueType(ExtVT)); + } + + auto IsUnsignedSaturate = [&](SDNode *Min) { + // For unsigned, we just need to check for <= 0xffff + if (Min->getOpcode() != ISD::UMIN) + return false; + + APInt SaturateC; + if (VT == MVT::v4i32) + SaturateC = APInt(32, (1 << 16) - 1, true); + else //if (VT == MVT::v8i16) + SaturateC = APInt(16, (1 << 8) - 1, true); + + APInt MinC; + if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || + MinC != SaturateC) + return false; + return true; + }; + + if (IsUnsignedSaturate(N)) { + SDLoc DL(N); + MVT HalfVT; + unsigned ExtConst; + if (VT == MVT::v4i32) { + HalfVT = MVT::v8i16; + ExtConst = 0x0000FFFF; + } else { //if (VT == MVT::v8i16) + HalfVT = MVT::v16i8; + ExtConst = 0x00FF; + } + + // Create a VQMOVNB with undef top lanes, then ZExt into the top half with + // an AND. That extend will hopefully be removed if only the bottom bits are + // demanded (through a truncating store, for example). + SDValue VQMOVN = + DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, + DAG.getConstant(0, DL, MVT::i32)); + SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); + return DAG.getNode(ISD::AND, DL, VT, Bitcast, + DAG.getConstant(ExtConst, DL, VT)); + } + + return SDValue(); +} + static const APInt *isPowerOf2Constant(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); if (!C) @@ -14602,10 +15885,41 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { return Res; } +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue Src = N->getOperand(0); + EVT DstVT = N->getValueType(0); + + // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. + if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { + EVT SrcVT = Src.getValueType(); + if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) + return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); + } + + // We may have a bitcast of something that has already had this bitcast + // combine performed on it, so skip past any VECTOR_REG_CASTs. + while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) + Src = Src.getOperand(0); + + // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that + // would be generated is at least the width of the element type.
+ EVT SrcVT = Src.getValueType(); + if ((Src.getOpcode() == ARMISD::VMOVIMM || + Src.getOpcode() == ARMISD::VMVNIMM || + Src.getOpcode() == ARMISD::VMOVFPIMM) && + SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && + DAG.getDataLayout().isBigEndian()) + return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); + + return SDValue(); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -14623,25 +15937,37 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); + case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI); case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); - case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::ANY_EXTEND: + return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::FP_EXTEND: + return PerformFPExtendCombine(N, DCI.DAG, Subtarget); + case ISD::SMIN: + case ISD::UMIN: + case ISD::SMAX: + case ISD::UMAX: + return PerformMinMaxCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); @@ -14652,10 +15978,25 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ISD::BITCAST: + return PerformBITCASTCombine(N, DCI.DAG, Subtarget); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); + case ARMISD::VECTOR_REG_CAST: + return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); case ARMISD::VCMP: return PerformVCMPCombine(N, DCI, Subtarget); + case ISD::VECREDUCE_ADD: + return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); + case ARMISD::VMOVN: + return PerformVMOVNCombine(N, DCI); + case ARMISD::VQMOVNs: + case ARMISD::VQMOVNu: + return PerformVQMOVNCombine(N, DCI); + case ARMISD::ASRL: + case ARMISD::LSRL: + case ARMISD::LSLL: + return PerformLongShiftCombine(N, DCI.DAG); case ARMISD::SMULWB: { unsigned 
BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -14744,6 +16085,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); + case Intrinsic::arm_mve_vld2q: + case Intrinsic::arm_mve_vld4q: + case Intrinsic::arm_mve_vst2q: + case Intrinsic::arm_mve_vst4q: + return PerformMVEVLDCombine(N, DCI); default: break; } break; @@ -14827,28 +16173,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; } -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); -} EVT ARMTargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... - if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; - if (Size >= 16 && - (memOpAlign(SrcAlign, DstAlign, 16) || + if (Op.size() >= 16 && + (Op.isAligned(Align(16)) || (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::v2f64; - } else if (Size >= 8 && - (memOpAlign(SrcAlign, DstAlign, 8) || + } else if (Op.size() >= 8 && + (Op.isAligned(Align(8)) || (allowsMisalignedMemoryAccesses( MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { @@ -14962,45 +16301,97 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, if (!Subtarget->hasMVEIntegerOps()) return false; - auto IsSinker = [](Instruction *I, int Operand) { + auto IsFMSMul = [&](Instruction *I) { + if (!I->hasOneUse()) + return false; + auto *Sub = cast<Instruction>(*I->users().begin()); + return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; + }; + auto IsFMS = [&](Instruction *I) { + if (match(I->getOperand(0), m_FNeg(m_Value())) || + match(I->getOperand(1), m_FNeg(m_Value()))) + return true; + return false; + }; + + auto IsSinker = [&](Instruction *I, int Operand) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Mul: + case Instruction::FAdd: case Instruction::ICmp: + case Instruction::FCmp: return true; + case Instruction::FMul: + return !IsFMSMul(I); case Instruction::Sub: + case Instruction::FSub: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: return Operand == 1; + case Instruction::Call: + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + return !IsFMS(I); + default: + return false; + } + } + return false; default: return false; } }; - int Op = 0; - if (!isa<ShuffleVectorInst>(I->getOperand(Op))) - Op = 1; - if (!IsSinker(I, Op)) - return false; - if (!match(I->getOperand(Op), - m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_Zero()))) { - return false; - } - Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Shuffle->uses()) { - Instruction *Insn = cast<Instruction>(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; + 
for (auto OpIdx : enumerate(I->operands())) { + Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); + // Make sure we are not already sinking this operand + if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + Instruction *Shuffle = Op; + if (Shuffle->getOpcode() == Instruction::BitCast) + Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); + // We are looking for a splat that can be sunk. + if (!Shuffle || + !match(Shuffle, m_Shuffle( + m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_ZeroMask()))) + continue; + if (!IsSinker(I, OpIdx.index())) + continue; + + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Op->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + if (Shuffle != Op) + Ops.push_back(&Op->getOperandUse(0)); + Ops.push_back(&OpIdx.value()); } - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(Op)); return true; } +Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { + if (!Subtarget->hasMVEIntegerOps()) + return nullptr; + Type *SVIType = SVI->getType(); + Type *ScalarType = SVIType->getScalarType(); + + if (ScalarType->isFloatTy()) + return Type::getInt32Ty(SVIType->getContext()); + if (ScalarType->isHalfTy()) + return Type::getInt16Ty(SVIType->getContext()); + return nullptr; +} + bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -15012,6 +16403,9 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } + if (Subtarget->hasMVEIntegerOps()) + return true; + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no @@ -15433,7 +16827,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -15468,16 +16862,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, // (in BE/masked) type. 
Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { - if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) + if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) return true; } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Align >= 4 && + } else if (Alignment >= 4 && (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Align >= 2 && + else if (Alignment >= 2 && (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; @@ -15499,28 +16893,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; - unsigned Align; + Align Alignment; bool isSEXTLoad = false; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); IsMasked = true; } else return false; @@ -15529,9 +16923,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, - IsMasked, Subtarget->isLittle(), Base, - Offset, isInc, DAG); + getMVEIndexedAddressParts( + Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, + Subtarget->isLittle(), Base, Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -15557,31 +16951,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; - unsigned Align; + Align Alignment; bool isSEXTLoad = false, isNonExt; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); isNonExt = !ST->isTruncatingStore(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); isNonExt = !ST->isTruncatingStore(); IsMasked = true; } else @@ -15607,7 +17001,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = 
Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, + getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { @@ -15722,18 +17116,23 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, if (Op.getOpcode() == ARMISD::VGETLANEs) Known = Known.sext(DstSz); else { - Known = Known.zext(DstSz, true /* extended bits are known zero */); + Known = Known.zext(DstSz); } assert(DstSz == Known.getBitWidth()); break; } + case ARMISD::VMOVrh: { + KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + assert(KnownOp.getBitWidth() == 16); + Known = KnownOp.zext(32); + break; + } } } -bool -ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, - const APInt &DemandedAPInt, - TargetLoweringOpt &TLO) const { +bool ARMTargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + TargetLoweringOpt &TLO) const { // Delay optimization, so we don't have to deal with illegal types, or block // optimizations. if (!TLO.LegalOps) @@ -15758,7 +17157,7 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, unsigned Mask = C->getZExtValue(); - unsigned Demanded = DemandedAPInt.getZExtValue(); + unsigned Demanded = DemandedBits.getZExtValue(); unsigned ShrunkMask = Mask & Demanded; unsigned ExpandedMask = Mask | ~Demanded; @@ -15813,6 +17212,35 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, return false; } +bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( + SDValue Op, const APInt &OriginalDemandedBits, + const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, + unsigned Depth) const { + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + case ARMISD::ASRL: + case ARMISD::LSRL: { + // If this is result 0 and the other result is unused, see if the demanded + // bits allow us to shrink this long shift into a standard small shift in + // the opposite direction.
+ if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && + isa<ConstantSDNode>(Op->getOperand(2))) { + unsigned ShAmt = Op->getConstantOperandVal(2); + if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf( + APInt::getAllOnesValue(32) << (32 - ShAmt))) + return TLO.CombineTo( + Op, TLO.DAG.getNode( + ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), + TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); + } + break; + } + } + + return TargetLowering::SimplifyDemandedBitsForTargetNode( + Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); +} //===----------------------------------------------------------------------===// // ARM Inline Assembly Support @@ -15823,7 +17251,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { if (!Subtarget->hasV6Ops()) return false; - InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); std::string AsmStr = IA->getAsmString(); SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -15831,7 +17259,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = AsmPieces[0]; + AsmStr = std::string(AsmPieces[0]); AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); @@ -16330,13 +17758,15 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); + SP = + DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); @@ -16373,6 +17803,18 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(!(DstSz == 32 && Subtarget->hasFP16()) && "With FP16, 16 to 32 conversion is legal!"); + // Converting from 32 -> 64 is valid if we have FP64. + if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { + // FIXME: Remove this when we have strict fp instruction selection patterns + if (IsStrict) { + SDLoc Loc(Op); + SDValue Result = DAG.getNode(ISD::FP_EXTEND, + Loc, Op.getValueType(), SrcVal); + return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); + } + return Op; + } + // Either we are converting from 16 -> 64, without FP16 and/or // FP.double-precision or without Armv8-fp. So we must do it in two // steps. 
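The ARMISD::ASRL/LSRL case added to SimplifyDemandedBitsForTargetNode above replaces a 64-bit long logical shift right with a single 32-bit SHL of the high input word when only the top ShAmt bits of the low result are demanded. Below is a minimal standalone C++ sketch of why that is sound; it is illustrative only (the names Lo, Hi and ShAmt are mine, not LLVM's), and it assumes, as the combine does, that operand 1 of the long shift node is the high input word.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Lo = 0x12345678, Hi = 0x9ABCDEF0;
  for (unsigned ShAmt = 1; ShAmt < 32; ++ShAmt) {
    uint64_t Wide = ((uint64_t)Hi << 32) | Lo;
    uint32_t LsrlLo = (uint32_t)(Wide >> ShAmt); // result 0 of the long shift
    uint32_t Shl = Hi << (32 - ShAmt);           // the cheaper replacement
    uint32_t DemandedMask = ~0u << (32 - ShAmt); // only the top ShAmt bits demanded
    assert((LsrlLo & DemandedMask) == (Shl & DemandedMask));
  }
  return 0;
}

The isSubsetOf check in the combine (demanded bits contained in an all-ones mask shifted left by 32 - ShAmt) corresponds to DemandedMask here: every demanded bit of the low result comes solely from the high word, so the long shift can be dropped.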
@@ -16528,7 +17970,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); + Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -16569,7 +18011,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); + Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -16595,6 +18037,34 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::arm_mve_vld2q: + case Intrinsic::arm_mve_vld4q: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + Type *VecTy = cast<StructType>(I.getType())->getElementType(1); + unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4; + Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Align(VecTy->getScalarSizeInBits() / 8); + // volatile loads with MVE intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::arm_mve_vst2q: + case Intrinsic::arm_mve_vst4q: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + Type *VecTy = I.getArgOperand(1)->getType(); + unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 
2 : 4; + Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Align(VecTy->getScalarSizeInBits() / 8); + // volatile stores with MVE intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); @@ -16603,7 +18073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -16615,7 +18085,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -16849,7 +18319,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); - unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); + unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { @@ -16868,7 +18338,7 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const { } bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { - return !Subtarget->hasMinSize(); + return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, @@ -16962,7 +18432,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } bool ARMTargetLowering::isLegalInterleavedAccessType( - unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { + unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); @@ -17021,8 +18491,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - VectorType *VecTy = Shuffles[0]->getType(); - Type *EltTy = VecTy->getVectorElementType(); + auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); + Type *EltTy = VecTy->getElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); @@ -17037,8 +18507,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. if (EltTy->isPointerTy()) - VecTy = - VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); IRBuilder<> Builder(LI); @@ -17048,15 +18517,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. 
- VecTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / NumLoads); + VecTy = FixedVectorType::get(VecTy->getElementType(), + VecTy->getNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace())); + BaseAddr, + VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); @@ -17081,8 +18550,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID LoadInts = Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; - Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace()); + Type *VecEltTy = + VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, VecEltTy}; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); @@ -17102,9 +18571,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = - Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, - VecTy->getVectorNumElements() * Factor); + BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, + VecTy->getNumElements() * Factor); CallInst *VldN = createLoadIntrinsic(BaseAddr); @@ -17119,8 +18587,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, VectorType::get(SV->getType()->getVectorElementType(), - VecTy->getVectorNumElements())); + SubVec, + FixedVectorType::get(SV->getType()->getElementType(), VecTy)); SubVecs[SV].push_back(SubVec); } @@ -17172,13 +18640,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - VectorType *VecTy = SVI->getType(); - assert(VecTy->getVectorNumElements() % Factor == 0 && - "Invalid interleaved store"); + auto *VecTy = cast<FixedVectorType>(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned LaneLen = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -17200,12 +18667,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. - Type *IntVecTy = - VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); + auto *IntVecTy = + FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType())); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, LaneLen); + SubVecTy = FixedVectorType::get(IntTy, LaneLen); } // The base address of the store. 
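As context for the interleaved-access lowering above: lowerInterleavedLoad matches the shufflevector pattern that de-interleaves a single wide load and replaces it with one NEON vldN or MVE vld2q/vld4q call, and lowerInterleavedStore does the mirror-image re-interleave for vstN/vst2q/vst4q. A minimal scalar C++ sketch of the factor-2 memory layout involved (illustrative only, my own example rather than LLVM code):

#include <array>
#include <cstdio>

int main() {
  // Two logical streams stored interleaved with factor 2: a0,b0,a1,b1,...
  std::array<int, 8> Mem = {10, 20, 11, 21, 12, 22, 13, 23};
  std::array<int, 4> A, B;
  for (unsigned i = 0; i < 4; ++i) {
    A[i] = Mem[2 * i];     // lane i of the first de-interleaved vector
    B[i] = Mem[2 * i + 1]; // lane i of the second de-interleaved vector
  }
  std::printf("A: %d %d %d %d\n", A[0], A[1], A[2], A[3]);
  std::printf("B: %d %d %d %d\n", B[0], B[1], B[2], B[3]);
  return 0;
}

A vld2 produces A and B directly from Mem in one instruction, which is why this is preferred over a wide load followed by two shuffles; the createSequentialMask shuffles on the store path rebuild the interleaved layout before the single vst2q/vst4q.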
@@ -17215,14 +18682,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( - SI->getPointerAddressSpace())); + BaseAddr, + SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); @@ -17252,7 +18719,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID StoreInts = Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; - Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( + Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo( SI->getPointerAddressSpace()); Type *Tys[] = {EltPtrTy, SubVecTy}; Function *VstNFunc = @@ -17274,7 +18741,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), BaseAddr, LaneLen * Factor); SmallVector<Value *, 4> Shuffles; @@ -17284,7 +18751,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -17301,7 +18768,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); } } @@ -17349,11 +18816,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, case HA_DOUBLE: return false; case HA_VECT64: - return VT->getBitWidth() == 64; + return VT->getPrimitiveSizeInBits().getFixedSize() == 64; case HA_VECT128: - return VT->getBitWidth() == 128; + return VT->getPrimitiveSizeInBits().getFixedSize() == 128; case HA_UNKNOWN: - switch (VT->getBitWidth()) { + switch (VT->getPrimitiveSizeInBits().getFixedSize()) { case 64: Base = HA_VECT64; return true; @@ -17372,7 +18839,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, /// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const { - const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); + const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); if (!ArgTy->isVectorTy()) return ABITypeAlign; @@ -17399,18 +18866,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( return IsHA || IsIntArray; } -unsigned ARMTargetLowering::getExceptionPointerRegister( +Register ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; + return Subtarget->useSjLjEH() ? Register() : ARM::R0; } -unsigned ARMTargetLowering::getExceptionSelectorRegister( +Register ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; + return Subtarget->useSjLjEH() ? Register() : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {