Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3005
1 file changed, 2425 insertions, 580 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d45a80057564a..85db14ab66feb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); -static cl::opt<bool> -EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, - cl::desc("Allow AArch64 SLI/SRI formation"), - cl::init(false)); - // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. @@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; +/// Returns true if VT's elements occupy the lowest bit positions of its +/// associated register class without any intervening space. +/// +/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the +/// same register class, but only nxv8f16 can be treated as a packed vector. +static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { + assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal vector type!"); + return VT.isFixedLengthVector() || + VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); @@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); + addDRTypeForNEON(MVT::v4bf16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); @@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); + addQRTypeForNEON(MVT::v8bf16); } if (Subtarget->hasSVE()) { @@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + if (Subtarget->hasBF16()) { + addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); + } + + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addRegisterClass(VT, &AArch64::ZPRRegClass); + } + for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); 
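
// A minimal standalone sketch of the packed-vector check that the new
// isPackedVectorType helper above documents (assumption: ScalableVT below is
// an illustrative stand-in for LLVM's MVT/EVT, not part of the patch).
// nxv2f16, nxv4f16 and nxv8f16 all live in the same ZPR register class, but
// only nxv8f16 fills a whole 128-bit SVE block without gaps.
#include <cassert>

struct ScalableVT {
  unsigned MinElts; // the "nxv<N>" known-minimum element count
  unsigned EltBits; // element width in bits
};

constexpr unsigned SVEBitsPerBlock = 128;

static bool isPackedScalable(ScalableVT VT) {
  return VT.MinElts * VT.EltBits == SVEBitsPerBlock;
}

int main() {
  assert(isPackedScalable({8, 16}));  // nxv8f16: 8 x 16 = 128, packed
  assert(!isPackedScalable({4, 16})); // nxv4f16: elements sit in wider lanes
  assert(isPackedScalable({2, 64}));  // nxv2f64: packed
  return 0;
}
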
setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); } for (auto VT : { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + + for (auto VT : + { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, + MVT::nxv2f64 }) { + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } } // Compute derived properties from the register classes @@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); @@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); @@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, 
MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Expand); } + // AArch64 doesn't have i32 MULH{S|U}. + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); + setOperationAction(ISD::CTPOP, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); @@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the + // custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. + setOperationAction(ISD::STORE, MVT::v32i8, Custom); + setOperationAction(ISD::STORE, MVT::v16i16, Custom); + setOperationAction(ISD::STORE, MVT::v16f16, Custom); + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v8f32, Custom); + setOperationAction(ISD::STORE, MVT::v4f64, Custom); + setOperationAction(ISD::STORE, MVT::v4i64, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; @@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); + setIndexedLoadAction(im, MVT::bf16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); @@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); + setIndexedStoreAction(im, MVT::bf16, Legal); } // Trap. 
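
// A rough illustration of what the new "AArch64 doesn't have i32 MULH{S|U}"
// Expand actions above amount to (assumption: plain C++ arithmetic, not the
// actual legalizer output): the high half of a 32x32-bit product is taken
// from a widened 64-bit multiply, which AArch64 covers with UMULL/SMULL plus
// a shift.
#include <cassert>
#include <cstdint>

static uint32_t mulhu32(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

int main() {
  assert(mulhu32(0x80000000u, 2u) == 1u);
  assert(mulhu32(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFEu);
  return 0;
}
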
@@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); + + setOperationAction(ISD::TRUNCATE, VT, Custom); } for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { @@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + if (Subtarget->hasSVE()) + setOperationAction(ISD::VSCALE, MVT::i32, Custom); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } @@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - if (isTypeLegal(VT)) + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + if (VT.getScalarType() == MVT::i1) + setOperationAction(ISD::SETCC, VT, Custom); + } } + + for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + + for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { + if (isTypeLegal(VT)) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } + + // NOTE: Currently this has to happen after computeRegisterProperties rather + // than the preferred option of combining it with the addRegisterClass call. + if (useSVEForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useSVEForFixedLengthVectorVT(VT)) + addTypeForFixedLengthSVE(VT); + + // 64bit results can mean a bigger than NEON input. + for (auto VT : {MVT::v8i8, MVT::v4i16}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); + + // 128bit results imply a bigger than NEON input. + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) + setOperationAction(ISD::TRUNCATE, VT, Custom); + for (auto VT : {MVT::v8f16, MVT::v4f32}) + setOperationAction(ISD::FP_ROUND, VT, Expand); + } } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); @@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { } } +void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. 
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + // Lower fixed length vector operations to scalable equivalents. + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); +} + void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); @@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, - EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, + LLVMContext &C, EVT VT) const { if (!VT.isVector()) return MVT::i32; + if (VT.isScalableVector()) + return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, } bool AArch64TargetLowering::targetShrinkDemandedConstant( - SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + TargetLoweringOpt &TLO) const { // Delay this optimization to as late as possible. if (!TLO.LegalOps) return false; @@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( "i32 or i64 is expected after legalization."); // Exit early if we demand all bits. - if (Demanded.countPopulation() == Size) + if (DemandedBits.countPopulation() == Size) return false; unsigned NewOpc; @@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( if (!C) return false; uint64_t Imm = C->getZExtValue(); - return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); + return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in @@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Same as above but handling LLTs instead. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( - LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; @@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. - Align <= 2 || + Alignment <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. 
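
// The addTypeForFixedLengthSVE hunk above uses an "expand everything, then
// opt back in" pattern for fixed-length vector types mapped onto SVE. A toy
// sketch of that idea (assumption: the Op/Action enums are simplified
// stand-ins for ISD opcodes and LegalizeAction, not LLVM definitions):
#include <array>
#include <cstdio>

enum Op { ADD, FADD, LOAD, STORE, TRUNCATE, MUL, NUM_OPS };
enum Action { Expand, Custom };

int main() {
  std::array<Action, NUM_OPS> Actions;
  Actions.fill(Expand); // by default every operation must be expanded
  for (Op O : {ADD, FADD, LOAD, STORE, TRUNCATE})
    Actions[O] = Custom; // these are lowered to scalable SVE equivalents
  std::printf("MUL: %s\n", Actions[MUL] == Expand ? "Expand" : "Custom");
  return 0;
}
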
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { +#define MAKE_CASE(V) \ + case V: \ + return #V; switch ((AArch64ISD::NodeType)Opcode) { - case AArch64ISD::FIRST_NUMBER: break; - case AArch64ISD::CALL: return "AArch64ISD::CALL"; - case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; - case AArch64ISD::ADR: return "AArch64ISD::ADR"; - case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; - case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; - case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; - case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; - case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; - case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; - case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; - case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; - case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; - case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; - case AArch64ISD::ADC: return "AArch64ISD::ADC"; - case AArch64ISD::SBC: return "AArch64ISD::SBC"; - case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; - case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; - case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; - case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; - case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; - case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; - case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; - case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; - case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::DUP: return "AArch64ISD::DUP"; - case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; - case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; - case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; - case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; - case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; - case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; - case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; - case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; - case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; - case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; - case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; - case AArch64ISD::BICi: return "AArch64ISD::BICi"; - case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; - case AArch64ISD::BSL: return "AArch64ISD::BSL"; - case AArch64ISD::NEG: return "AArch64ISD::NEG"; - case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; - case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; - case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; - case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; - case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; - case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; - case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; - case AArch64ISD::REV16: return "AArch64ISD::REV16"; - case AArch64ISD::REV32: return "AArch64ISD::REV32"; - case AArch64ISD::REV64: return "AArch64ISD::REV64"; - case AArch64ISD::EXT: return "AArch64ISD::EXT"; - case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; - case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; - case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; - case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; - case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; - case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; - case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; - case AArch64ISD::CMHS: return 
"AArch64ISD::CMHS"; - case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; - case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; - case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; - case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; - case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; - case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; - case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; - case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; - case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; - case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; - case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; - case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; - case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; - case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; - case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; - case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; - case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; - case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; - case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; - case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED"; - case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED"; - case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED"; - case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED"; - case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED"; - case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED"; - case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED"; - case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N"; - case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N"; - case AArch64ISD::LASTA: return "AArch64ISD::LASTA"; - case AArch64ISD::LASTB: return "AArch64ISD::LASTB"; - case AArch64ISD::REV: return "AArch64ISD::REV"; - case AArch64ISD::TBL: return "AArch64ISD::TBL"; - case AArch64ISD::NOT: return "AArch64ISD::NOT"; - case AArch64ISD::BIT: return "AArch64ISD::BIT"; - case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; - case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; - case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; - case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; - case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; - case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; - case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; - case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; - case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; - case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; - case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; - case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; - case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; - case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; - case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; - case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; - case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; - case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; - case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; - case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; - case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; - case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; - case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; - case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; - case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; - case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; - case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; - case AArch64ISD::LD1DUPpost: return 
"AArch64ISD::LD1DUPpost"; - case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; - case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; - case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; - case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; - case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; - case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; - case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; - case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; - case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; - case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; - case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; - case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; - case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; - case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; - case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; - case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; - case AArch64ISD::STG: return "AArch64ISD::STG"; - case AArch64ISD::STZG: return "AArch64ISD::STZG"; - case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; - case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; - case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; - case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; - case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; - case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; - case AArch64ISD::INSR: return "AArch64ISD::INSR"; - case AArch64ISD::PTEST: return "AArch64ISD::PTEST"; - case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE"; - case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; - case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; - case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; - case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW"; - case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED"; - case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED"; - case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM"; - case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S"; - case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED"; - case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW"; - case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW"; - case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; - case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; - case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; - case AArch64ISD::SST1: return "AArch64ISD::SST1"; - case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; - case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; - case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; - case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; - case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; - case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; - case AArch64ISD::LDP: return "AArch64ISD::LDP"; - case AArch64ISD::STP: return "AArch64ISD::STP"; - } + case AArch64ISD::FIRST_NUMBER: + break; + MAKE_CASE(AArch64ISD::CALL) + MAKE_CASE(AArch64ISD::ADRP) + MAKE_CASE(AArch64ISD::ADR) + MAKE_CASE(AArch64ISD::ADDlow) + MAKE_CASE(AArch64ISD::LOADgot) + MAKE_CASE(AArch64ISD::RET_FLAG) + MAKE_CASE(AArch64ISD::BRCOND) + MAKE_CASE(AArch64ISD::CSEL) + MAKE_CASE(AArch64ISD::FCSEL) + MAKE_CASE(AArch64ISD::CSINV) + MAKE_CASE(AArch64ISD::CSNEG) + MAKE_CASE(AArch64ISD::CSINC) + MAKE_CASE(AArch64ISD::THREAD_POINTER) + 
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::ADD_PRED) + MAKE_CASE(AArch64ISD::SDIV_PRED) + MAKE_CASE(AArch64ISD::UDIV_PRED) + MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1) + MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1) + MAKE_CASE(AArch64ISD::SHL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRL_MERGE_OP1) + MAKE_CASE(AArch64ISD::SRA_MERGE_OP1) + MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ADC) + MAKE_CASE(AArch64ISD::SBC) + MAKE_CASE(AArch64ISD::ADDS) + MAKE_CASE(AArch64ISD::SUBS) + MAKE_CASE(AArch64ISD::ADCS) + MAKE_CASE(AArch64ISD::SBCS) + MAKE_CASE(AArch64ISD::ANDS) + MAKE_CASE(AArch64ISD::CCMP) + MAKE_CASE(AArch64ISD::CCMN) + MAKE_CASE(AArch64ISD::FCCMP) + MAKE_CASE(AArch64ISD::FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMP) + MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::DUP) + MAKE_CASE(AArch64ISD::DUPLANE8) + MAKE_CASE(AArch64ISD::DUPLANE16) + MAKE_CASE(AArch64ISD::DUPLANE32) + MAKE_CASE(AArch64ISD::DUPLANE64) + MAKE_CASE(AArch64ISD::MOVI) + MAKE_CASE(AArch64ISD::MOVIshift) + MAKE_CASE(AArch64ISD::MOVIedit) + MAKE_CASE(AArch64ISD::MOVImsl) + MAKE_CASE(AArch64ISD::FMOV) + MAKE_CASE(AArch64ISD::MVNIshift) + MAKE_CASE(AArch64ISD::MVNImsl) + MAKE_CASE(AArch64ISD::BICi) + MAKE_CASE(AArch64ISD::ORRi) + MAKE_CASE(AArch64ISD::BSP) + MAKE_CASE(AArch64ISD::NEG) + MAKE_CASE(AArch64ISD::EXTR) + MAKE_CASE(AArch64ISD::ZIP1) + MAKE_CASE(AArch64ISD::ZIP2) + MAKE_CASE(AArch64ISD::UZP1) + MAKE_CASE(AArch64ISD::UZP2) + MAKE_CASE(AArch64ISD::TRN1) + MAKE_CASE(AArch64ISD::TRN2) + MAKE_CASE(AArch64ISD::REV16) + MAKE_CASE(AArch64ISD::REV32) + MAKE_CASE(AArch64ISD::REV64) + MAKE_CASE(AArch64ISD::EXT) + MAKE_CASE(AArch64ISD::VSHL) + MAKE_CASE(AArch64ISD::VLSHR) + MAKE_CASE(AArch64ISD::VASHR) + MAKE_CASE(AArch64ISD::VSLI) + MAKE_CASE(AArch64ISD::VSRI) + MAKE_CASE(AArch64ISD::CMEQ) + MAKE_CASE(AArch64ISD::CMGE) + MAKE_CASE(AArch64ISD::CMGT) + MAKE_CASE(AArch64ISD::CMHI) + MAKE_CASE(AArch64ISD::CMHS) + MAKE_CASE(AArch64ISD::FCMEQ) + MAKE_CASE(AArch64ISD::FCMGE) + MAKE_CASE(AArch64ISD::FCMGT) + MAKE_CASE(AArch64ISD::CMEQz) + MAKE_CASE(AArch64ISD::CMGEz) + MAKE_CASE(AArch64ISD::CMGTz) + MAKE_CASE(AArch64ISD::CMLEz) + MAKE_CASE(AArch64ISD::CMLTz) + MAKE_CASE(AArch64ISD::FCMEQz) + MAKE_CASE(AArch64ISD::FCMGEz) + MAKE_CASE(AArch64ISD::FCMGTz) + MAKE_CASE(AArch64ISD::FCMLEz) + MAKE_CASE(AArch64ISD::FCMLTz) + MAKE_CASE(AArch64ISD::SADDV) + MAKE_CASE(AArch64ISD::UADDV) + MAKE_CASE(AArch64ISD::SRHADD) + MAKE_CASE(AArch64ISD::URHADD) + MAKE_CASE(AArch64ISD::SMINV) + MAKE_CASE(AArch64ISD::UMINV) + MAKE_CASE(AArch64ISD::SMAXV) + MAKE_CASE(AArch64ISD::UMAXV) + MAKE_CASE(AArch64ISD::SMAXV_PRED) + MAKE_CASE(AArch64ISD::UMAXV_PRED) + MAKE_CASE(AArch64ISD::SMINV_PRED) + MAKE_CASE(AArch64ISD::UMINV_PRED) + MAKE_CASE(AArch64ISD::ORV_PRED) + MAKE_CASE(AArch64ISD::EORV_PRED) + MAKE_CASE(AArch64ISD::ANDV_PRED) + MAKE_CASE(AArch64ISD::CLASTA_N) + MAKE_CASE(AArch64ISD::CLASTB_N) + MAKE_CASE(AArch64ISD::LASTA) + MAKE_CASE(AArch64ISD::LASTB) + MAKE_CASE(AArch64ISD::REV) + MAKE_CASE(AArch64ISD::REINTERPRET_CAST) + MAKE_CASE(AArch64ISD::TBL) + MAKE_CASE(AArch64ISD::FADD_PRED) + MAKE_CASE(AArch64ISD::FADDA_PRED) + MAKE_CASE(AArch64ISD::FADDV_PRED) + MAKE_CASE(AArch64ISD::FMA_PRED) + MAKE_CASE(AArch64ISD::FMAXV_PRED) + MAKE_CASE(AArch64ISD::FMAXNMV_PRED) + MAKE_CASE(AArch64ISD::FMINV_PRED) + MAKE_CASE(AArch64ISD::FMINNMV_PRED) + MAKE_CASE(AArch64ISD::NOT) + MAKE_CASE(AArch64ISD::BIT) + MAKE_CASE(AArch64ISD::CBZ) + 
MAKE_CASE(AArch64ISD::CBNZ) + MAKE_CASE(AArch64ISD::TBZ) + MAKE_CASE(AArch64ISD::TBNZ) + MAKE_CASE(AArch64ISD::TC_RETURN) + MAKE_CASE(AArch64ISD::PREFETCH) + MAKE_CASE(AArch64ISD::SITOF) + MAKE_CASE(AArch64ISD::UITOF) + MAKE_CASE(AArch64ISD::NVCAST) + MAKE_CASE(AArch64ISD::SQSHL_I) + MAKE_CASE(AArch64ISD::UQSHL_I) + MAKE_CASE(AArch64ISD::SRSHR_I) + MAKE_CASE(AArch64ISD::URSHR_I) + MAKE_CASE(AArch64ISD::SQSHLU_I) + MAKE_CASE(AArch64ISD::WrapperLarge) + MAKE_CASE(AArch64ISD::LD2post) + MAKE_CASE(AArch64ISD::LD3post) + MAKE_CASE(AArch64ISD::LD4post) + MAKE_CASE(AArch64ISD::ST2post) + MAKE_CASE(AArch64ISD::ST3post) + MAKE_CASE(AArch64ISD::ST4post) + MAKE_CASE(AArch64ISD::LD1x2post) + MAKE_CASE(AArch64ISD::LD1x3post) + MAKE_CASE(AArch64ISD::LD1x4post) + MAKE_CASE(AArch64ISD::ST1x2post) + MAKE_CASE(AArch64ISD::ST1x3post) + MAKE_CASE(AArch64ISD::ST1x4post) + MAKE_CASE(AArch64ISD::LD1DUPpost) + MAKE_CASE(AArch64ISD::LD2DUPpost) + MAKE_CASE(AArch64ISD::LD3DUPpost) + MAKE_CASE(AArch64ISD::LD4DUPpost) + MAKE_CASE(AArch64ISD::LD1LANEpost) + MAKE_CASE(AArch64ISD::LD2LANEpost) + MAKE_CASE(AArch64ISD::LD3LANEpost) + MAKE_CASE(AArch64ISD::LD4LANEpost) + MAKE_CASE(AArch64ISD::ST2LANEpost) + MAKE_CASE(AArch64ISD::ST3LANEpost) + MAKE_CASE(AArch64ISD::ST4LANEpost) + MAKE_CASE(AArch64ISD::SMULL) + MAKE_CASE(AArch64ISD::UMULL) + MAKE_CASE(AArch64ISD::FRECPE) + MAKE_CASE(AArch64ISD::FRECPS) + MAKE_CASE(AArch64ISD::FRSQRTE) + MAKE_CASE(AArch64ISD::FRSQRTS) + MAKE_CASE(AArch64ISD::STG) + MAKE_CASE(AArch64ISD::STZG) + MAKE_CASE(AArch64ISD::ST2G) + MAKE_CASE(AArch64ISD::STZ2G) + MAKE_CASE(AArch64ISD::SUNPKHI) + MAKE_CASE(AArch64ISD::SUNPKLO) + MAKE_CASE(AArch64ISD::UUNPKHI) + MAKE_CASE(AArch64ISD::UUNPKLO) + MAKE_CASE(AArch64ISD::INSR) + MAKE_CASE(AArch64ISD::PTEST) + MAKE_CASE(AArch64ISD::PTRUE) + MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) + MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) + 
MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::ST1_PRED) + MAKE_CASE(AArch64ISD::SST1_PRED) + MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) + MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) + MAKE_CASE(AArch64ISD::SST1_IMM_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_PRED) + MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) + MAKE_CASE(AArch64ISD::LDP) + MAKE_CASE(AArch64ISD::STP) + MAKE_CASE(AArch64ISD::STNP) + MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::INDEX_VECTOR) + } +#undef MAKE_CASE return nullptr; } @@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } -MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( - MachineInstr &MI, MachineBasicBlock *BB) const { - MI.eraseFromParent(); - return BB; -} - MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); - case AArch64::CATCHPAD: - return EmitLoweredCatchPad(MI, BB); } } @@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) { (CC == ISD::SETEQ || CC == ISD::SETNE); } +static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, + SelectionDAG &DAG, SDValue Chain, + bool IsSignaling) { + EVT VT = LHS.getValueType(); + assert(VT != MVT::f128); + assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); + unsigned Opcode = + IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; + return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); +} + static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); @@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; LHS = LHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = AArch64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); + } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { + if (LHS.getOpcode() == ISD::AND) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. 
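
// The getTargetNodeName rewrite above replaces hundreds of hand-written
// 'case X: return "X";' lines with a stringizing MAKE_CASE macro. A
// self-contained sketch of that pattern (assumption: NodeKind and its values
// are illustrative, not the AArch64ISD enum):
#include <cstdio>

enum class NodeKind { Call, Adrp, RetFlag };

#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;

static const char *nodeName(NodeKind K) {
  switch (K) {
    MAKE_CASE(NodeKind::Call)
    MAKE_CASE(NodeKind::Adrp)
    MAKE_CASE(NodeKind::RetFlag)
  }
  return nullptr;
}
#undef MAKE_CASE

int main() {
  std::puts(nodeName(NodeKind::Adrp)); // prints "NodeKind::Adrp"
  return 0;
}
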
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, + DAG.getVTList(VT, MVT_CC), + LHS.getOperand(0), + LHS.getOperand(1)); + // Replace all users of (and X, Y) with newly generated (ands X, Y) + DAG.ReplaceAllUsesWith(LHS, ANDSNode); + return ANDSNode.getValue(1); + } else if (LHS.getOpcode() == AArch64ISD::ANDS) { + // Use result of ANDS + return LHS.getValue(1); + } } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) @@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end()); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; -} - -// Returns true if the given Op is the overflow flag result of an overflow -// intrinsic operation. -static bool isOverflowIntrOpRes(SDValue Op) { - unsigned Opc = Op.getOpcode(); - return (Op.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, + CallOptions, dl, Chain); + return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. - if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { + if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); @@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = SrcVal.getValueType(); + + if (SrcVT != MVT::f128) { + // Expand cases where the input is a vector bigger than NEON. + if (useSVEForFixedLengthVectorVT(SrcVT)) + return SDValue(); + // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. - SDValue SrcVal = Op.getOperand(0); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, - SDLoc(Op)).first; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Result; + SDLoc dl(Op); + std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, + CallOptions, dl, Chain); + return IsStrict ? 
DAG.getMergeValues({Result, Chain}, dl) : Result; } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + + if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. - if (Op.getOperand(0).getValueType() == MVT::f16 && - !Subtarget->hasFullFP16()) { + if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), - DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); } - if (Op.getOperand(0).getValueType() != MVT::f128) { + if (SrcVal.getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; + return LowerF128Call(Op, DAG, LC); } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); + // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { + assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, - DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), + DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. 
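
// LowerF128Call and the FP_ROUND/FP_TO_INT paths above route f128 through
// runtime library calls because AArch64 has no f128 hardware. A hedged,
// compile-it-yourself illustration (assumption: an AArch64 Linux target,
// where long double is IEEE binary128): building this with
//   clang --target=aarch64-linux-gnu -O1 -S f128_add.cpp
// should show a branch to a helper such as __addtf3 rather than an FADD.
long double f128_add(long double a, long double b) {
  return a + b; // no hardware f128 add; lowered to a libcall
}
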
- if (Op.getOperand(0).getValueType() == MVT::i128) + if (SrcVal.getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based @@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return Op; RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + if (Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP) + LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } @@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() != MVT::f16) + EVT OpVT = Op.getValueType(); + if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); @@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( - DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } @@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, - MVT::i64)); + SDValue Chain = Op.getOperand(0); + SDValue FPCR_64 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, + {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); + Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); + SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); + return DAG.getMergeValues({AND, Chain}, dl); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { @@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, + int Pattern) { + return DAG.getNode(AArch64ISD::PTRUE, DL, VT, + DAG.getTargetConstant(Pattern, DL, MVT::i32)); +} + SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_ptrue: return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_dupq_lane: + return LowerDUPQLane(Op, DAG); + case Intrinsic::aarch64_sve_convert_from_svbool: + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_convert_to_svbool: { 
+ EVT OutVT = Op.getValueType(); + EVT InVT = Op.getOperand(1).getValueType(); + // Return the operand if the cast isn't changing type, + // i.e. <n x 16 x i1> -> <n x 16 x i1> + if (InVT == OutVT) + return Op.getOperand(1); + // Otherwise, zero the newly introduced lanes. + SDValue Reinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1)); + SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all); + SDValue MaskReinterpret = + DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask); + return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret); + } case Intrinsic::aarch64_sve_insr: { SDValue Scalar = Op.getOperand(2); @@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } + + case Intrinsic::aarch64_neon_vsri: + case Intrinsic::aarch64_neon_vsli: { + EVT Ty = Op.getValueType(); + + if (!Ty.isVector()) + report_fatal_error("Unexpected type for aarch64_neon_vsli"); + + assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); + + bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; + unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3)); + } + + case Intrinsic::aarch64_neon_srhadd: + case Intrinsic::aarch64_neon_urhadd: { + bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd; + unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } } } @@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT MemVT = StoreNode->getMemoryVT(); if (VT.isVector()) { + if (useSVEForFixedLengthVectorVT(VT)) + return LowerFixedLengthVectorStoreToSVE(Op, DAG); + unsigned AS = StoreNode->getAddressSpace(); - unsigned Align = StoreNode->getAlignment(); - if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, + Align Alignment = StoreNode->getAlign(); + if (Alignment < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), StoreNode->getMemOperand()->getFlags(), nullptr)) { return scalarizeVectorStore(StoreNode, DAG); @@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, if (StoreNode->isTruncatingStore()) { return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); } + // 256 bit non-temporal stores can be lowered to STNP. Do this as part of + // the custom lowering, as there are no un-paired non-temporal stores and + // legalization will break up 256 bit inputs. 
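
// The store-splitting code that follows implements the STNP comment above:
// a 256-bit non-temporal store becomes two 128-bit halves feeding one STNP.
// A rough standalone model of that split (assumption: fixed-width arrays
// stand in for the EXTRACT_SUBVECTOR halves; this is not the DAG code):
#include <array>
#include <cstdint>
#include <cstring>

using V256 = std::array<std::uint64_t, 4>; // one 256-bit vector value
using V128 = std::array<std::uint64_t, 2>; // one 128-bit half (a q register)

static void storeNonTemporal256(std::uint64_t *Mem, const V256 &V) {
  V128 Lo = {V[0], V[1]};                      // EXTRACT_SUBVECTOR at index 0
  V128 Hi = {V[2], V[3]};                      // EXTRACT_SUBVECTOR at NumElts / 2
  std::memcpy(Mem, Lo.data(), sizeof(Lo));     // first STNP operand
  std::memcpy(Mem + 2, Hi.data(), sizeof(Hi)); // second STNP operand
}

int main() {
  std::uint64_t Buf[4] = {};
  storeNonTemporal256(Buf, {1, 2, 3, 4});
  return (Buf[0] == 1 && Buf[3] == 4) ? 0 : 1;
}
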
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && + MemVT.getVectorElementCount().Min % 2u == 0 && + ((MemVT.getScalarSizeInBits() == 8u || + MemVT.getScalarSizeInBits() == 16u || + MemVT.getScalarSizeInBits() == 32u || + MemVT.getScalarSizeInBits() == 64u))) { + SDValue Lo = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), + {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + StoreNode->getMemoryVT(), StoreNode->getMemOperand()); + return Result; + } } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { assert(StoreNode->getValue()->getValueType(0) == MVT::i128); SDValue Lo = @@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); @@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FMA: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::INSERT_SUBVECTOR: + return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::SDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED); + case ISD::UDIV: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED); + case ISD::SMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1); + case ISD::UMIN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1); + case ISD::SMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1); + case ISD::UMAX: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1); case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -3218,9 +3571,68 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::VSCALE: + return LowerVSCALE(Op, DAG); + case ISD::TRUNCATE: + return 
LowerTRUNCATE(Op, DAG); + case ISD::LOAD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerFixedLengthVectorLoadToSVE(Op, DAG); + llvm_unreachable("Unexpected request to lower ISD::LOAD"); + case ISD::ADD: + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); + llvm_unreachable("Unexpected request to lower ISD::ADD"); } } +bool AArch64TargetLowering::useSVEForFixedLengthVectors() const { + // Prefer NEON unless larger SVE registers are available. + return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256; +} + +bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const { + if (!useSVEForFixedLengthVectors()) + return false; + + if (!VT.isFixedLengthVector()) + return false; + + // Fixed length predicates should be promoted to i8. + // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. + if (VT.getVectorElementType() == MVT::i1) + return false; + + // Don't use SVE for vectors we cannot scalarize if required. + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f16: + case MVT::f32: + case MVT::f64: + break; + } + + // Ensure NEON MVTs only belong to a single register class. + if (VT.getSizeInBits() <= 128) + return false; + + // Don't use SVE for types that don't fit. + if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) + return false; + + // TODO: Perhaps an artificial restriction, but worth having whilst getting + // the base fixed length SVE support in place. + if (!VT.isPow2VectorType()) + return false; + + return true; +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { default: report_fatal_error("Unsupported calling convention."); - case CallingConv::AArch64_SVE_VectorCall: - // Calling SVE functions is currently not yet supported. 
- report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: @@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::CFGuard_Check: return CC_AArch64_Win64_CFGuard_Check; case CallingConv::AArch64_VectorCall: + case CallingConv::AArch64_SVE_VectorCall: return CC_AArch64_AAPCS; } } @@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; - else if (RegVT == MVT::f16) + else if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; @@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; @@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; + unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect + ? VA.getLocVT().getSizeInBits() + : VA.getValVT().getSizeInBits()) / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && @@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MemVT = VA.getLocVT(); + break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MemVT); } + + if (VA.getLocInfo() == CCValAssign::Indirect) { + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + // If value is passed via pointer - do a load. + ArgValue = + DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo()); + } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), ArgValue, DAG.getValueType(MVT::i32)); @@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, // The extra size here, if triggered, will always be 8. 
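
// The varargs code above sizes the register save area from how many argument
// registers the named parameters already used. A small worked model
// (assumption: the constants mirror AAPCS64's eight x-registers and eight
// q-registers; these helpers are illustrative, not the LLVM code):
#include <cassert>

static unsigned gprSaveSize(unsigned FirstVariadicGPR) {
  const unsigned NumGPRArgRegs = 8; // x0..x7, 8 bytes each
  return 8 * (NumGPRArgRegs - FirstVariadicGPR);
}

static unsigned fprSaveSize(unsigned FirstVariadicFPR) {
  const unsigned NumFPRArgRegs = 8; // q0..q7, 16 bytes each
  return 16 * (NumFPRArgRegs - FirstVariadicFPR);
}

int main() {
  // A printf-like callee with one named pointer argument: x0 is named, so
  // x1..x7 and q0..q7 must be saved for later va_arg use.
  assert(gprSaveSize(1) == 56);
  assert(fprSaveSize(0) == 128);
  return 0;
}
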
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { - FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); + FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); @@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + // When using the Windows calling convention on a non-Windows OS, we want + // to back up and restore X18 in such functions; we can't do a tail call + // from those functions. + if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && + CalleeCC != CallingConv::Win64) + return false; + // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. @@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + // If any of the arguments is passed indirectly, it must be SVE, so the + // 'getBytesInStackArgArea' is not sufficient to determine whether we need to + // allocate space on the stack. That is why we determine this explicitly here: + // the call cannot be a tailcall. + if (llvm::any_of(ArgLocs, [](CCValAssign &A) { + assert((A.getLocInfo() != CCValAssign::Indirect || + A.getValVT().isScalableVector()) && + "Expected value to be scalable"); + return A.getLocInfo() == CCValAssign::Indirect; + })) + return false; + // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) @@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) { + if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); @@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - llvm_unreachable("Spilling of SVE vectors not yet implemented"); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); + Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); + int FI = MFI.CreateStackObject( + VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false); + MFI.setStackID(FI, TargetStackID::SVEVector); + + SDValue SpillSlot = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getStore( + Chain, DL, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + Arg = SpillSlot; + break; } if (VA.isRegLoc()) { @@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); } } else { @@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getValVT().getSizeInBits(); + unsigned OpSize; + if (VA.getLocInfo() == CCValAssign::Indirect) + OpSize = VA.getLocVT().getSizeInBits(); + else + OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { @@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + Chain, DL, DstAddr, Arg, SizeNode, + Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, - /*isTailCall = */ false, - DstInfo, MachinePointerInfo()); + /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. 
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { - return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), N->getOffset(), Flag); } @@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. - if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && + if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) @@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Cmp); } - assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || - LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || + LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. @@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; + } else if (VT == MVT::i128) { + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || @@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Chain; + if (IsStrict) + Chain = Op.getOperand(0); + SDValue LHS = Op.getOperand(OpNo + 0); + SDValue RHS = Op.getOperand(OpNo + 1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. @@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, + IsSignaling); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); - return LHS; + return IsStrict ? 
DAG.getMergeValues({LHS, Chain}, dl) : LHS; } } @@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } // Now we know we're dealing with FP values. @@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + SDValue Cmp; + if (IsStrict) + Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); + else + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue Res; if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); @@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. - return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in @@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); - return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } + return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, @@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue FVal = Op->getOperand(2); SDLoc DL(Op); + EVT Ty = Op.getValueType(); + if (Ty.isScalableVector()) { + SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal); + MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); + SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC); + return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); + } + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. - if (isOverflowIntrOpRes(CCVal)) { + if (ISD::isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. 
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); @@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, - false, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); + DAG.getConstant(VaListSize, DL, MVT::i32), + Align(PtrSize), false, false, false, + MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); + MaybeAlign Align(Op.getConstantOperandVal(3)); unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); @@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain = VAList.getValue(1); VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > MinSlotSize) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Align - 1, DL, PtrVT)); + DAG.getConstant(Align->value() - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-(int64_t)Align, DL, PtrVT)); + DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || - VT.getVectorElementType() == MVT::f16) + VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); @@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; - else if (EltTy == MVT::i16 || EltTy == MVT::f16) + else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; @@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; - if (EltType == MVT::i16 || EltType == MVT::f16) + if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; @@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, // Extend input splat value where needed to fit into a GPR (32b or 64b only) // FPRs don't have this restriction. 
switch (ElemVT.getSimpleVT().SimpleTy) { - case MVT::i8: - case MVT::i16: - case MVT::i32: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); - case MVT::i64: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); case MVT::i1: { + // The only legal i1 vectors are SVE vectors, so we can use SVE-specific + // lowering code. + if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { + if (ConstVal->isOne()) + return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); + // TODO: Add special case for constant false + } // The general case of i1. There isn't any natural way to do this, // so we use some trickery with whilelo. - // TODO: Add special cases for splat of constant true/false. SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, DAG.getValueType(MVT::i1)); @@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, DAG.getConstant(0, dl, MVT::i64), SplatVal); } - // TODO: we can support float types, but haven't added patterns yet. + case MVT::i8: + case MVT::i16: + case MVT::i32: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; case MVT::f16: + case MVT::bf16: + case MVT::f32: + case MVT::f64: + // Fine as is + break; default: report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + +SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + if (!isTypeLegal(VT) || !VT.isScalableVector()) + return SDValue(); + + // Current lowering only supports the SVE-ACLE types. + if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) + return SDValue(); + + // The DUPQ operation is independent of element type so normalise to i64s. + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); + SDValue Idx128 = Op.getOperand(2); + + // DUPQ can be used when idx is in range. + auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); + if (CIdx && (CIdx->getZExtValue() <= 3)) { + SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); + SDNode *DUPQ = + DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); + return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); + } + + // The ACLE says this must produce the same result as: + // svtbl(data, svadd_x(svptrue_b64(), + // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), + // index * 2)) + SDValue One = DAG.getConstant(1, DL, MVT::i64); + SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); + + // create the vector 0,1,0,1,... + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, MVT::nxv2i64, Zero, One); + SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); + + // create the vector idx64,idx64+1,idx64,idx64+1,... + SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); + SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); + SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); + + // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
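// (Illustration, not from the patch: a concrete instance of the index arithmetic above.
// For Idx128 == 4 on an nxv2i64 input, Idx64 == 8, SV is <0,1,0,1,...> and ShuffleMask is
// <8,9,8,9,...>, so the TBL below writes elements 8 and 9 of V into every 128-bit chunk of
// the result, i.e. it broadcasts quadword lane 4, matching the svtbl formula quoted above.)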
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); + return DAG.getNode(ISD::BITCAST, DL, VT, TBL); } + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) { // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. +// BUILD_VECTORs with constant element C1, C2 is a constant, and: +// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) +// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) +// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) + SDValue And; + SDValue Shift; + + SDValue FirstOp = N->getOperand(0); + unsigned FirstOpc = FirstOp.getOpcode(); + SDValue SecondOp = N->getOperand(1); + unsigned SecondOpc = SecondOp.getOpcode(); + + // Is one of the operands an AND or a BICi? The AND may have been optimised to + // a BICi in order to use an immediate instead of a register. + // Is the other operand an shl or lshr? This will have been turned into: + // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. + if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && + (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { + And = FirstOp; + Shift = SecondOp; + + } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && + (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { + And = SecondOp; + Shift = FirstOp; + } else return SDValue(); - // Is the second op an shl or lshr? - SDValue Shift = N->getOperand(1); - // This will have been turned into: AArch64ISD::VSHL vector, #shift - // or AArch64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + bool IsAnd = And.getOpcode() == ISD::AND; + bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); if (!C2node) return SDValue(); - // Is the and mask vector all constant? uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); + if (IsAnd) { + // Is the and mask vector all constant? + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + } else { + // Reconstruct the corresponding AND immediate from the two BICi immediates. + ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1)); + ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2)); + assert(C1nodeImm && C1nodeShift); + C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); + } - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? + // Is C1 == ~(Ones(ElemSizeInBits) << C2) or + // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account + // how much one can shift elements of a particular size? 
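// (Illustration, not from the patch: a worked instance of this condition, assuming 8-bit
// elements and C2 == 3. In the SLI case C1 must be ~(0xff << 3) == 0x07, so the AND keeps
// only the low three bits of X and the OR merges them under Y << 3, which is exactly what
// SLI X, Y, #3 computes. In the SRI case C1 must be ~(0xff >> 3) == 0xe0, keeping the high
// three bits of X under Y >> 3.)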
uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) + + APInt C1AsAPInt(ElemSizeInBits, C1); + APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) + : APInt::getLowBitsSet(ElemSizeInBits, C2); + if (C1AsAPInt != RequiredC1) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); - unsigned Intrin = - IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, DL, MVT::i32), X, Y, - Shift.getOperand(1)); + unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); @@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableAArch64SlrGeneration) { - if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) - return Res; - } + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) + return Res; EVT VT = Op.getValueType(); @@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; EVT EltTy = VT.getVectorElementType(); - assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && - "Unsupported floating-point vector type"); + assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || + EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); @@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform insertion by expanding the value @@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || - VT == MVT::v8f16) + VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && + VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform extraction by expanding the value @@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... 
- if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); + assert(Op.getValueType().isFixedLengthVector() && + "Only cases that extract a fixed length vector are supported!"); + EVT InVT = Op.getOperand(0).getValueType(); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); + if (InVT.isScalableVector()) { + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG)) + return Op; + + return SDValue(); + } + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. - if (Val == 0) + if (Idx == 0 && InVT.getSizeInBits() <= 128) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64) + return Op; + + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + "Only expect to lower inserts into scalable vectors!"); + + EVT InVT = Op.getOperand(1).getValueType(); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + + // We don't have any patterns for scalable vector yet. + if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT)) + return SDValue(); + + // This will be matched by custom code during ISelDAGToDAG. + if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { + // Currently no fixed length shuffles that require SVE are legal. + if (useSVEForFixedLengthVectorVT(VT)) + return false; + if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; @@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } +// Attempt to form urhadd(OpA, OpB) from +// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)). +// The original form of this expression is +// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function +// is called the srl will have been lowered to AArch64ISD::VLSHR and the +// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)). +// This pass can also recognize a variant of this pattern that uses sign +// extension instead of zero extension and form a srhadd(OpA, OpB) from it. +SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (!VT.isVector() || VT.isScalableVector()) + return Op; + + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + return LowerFixedLengthVectorTruncateToSVE(Op, DAG); + + // Since we are looking for a right shift by a constant value of 1 and we are + // operating on types at least 16 bits in length (sign/zero extended OpA and + // OpB, which are at least 8 bits), it follows that the truncate will always + // discard the shifted-in bit and therefore the right shift will be logical + // regardless of the signedness of OpA and OpB. + SDValue Shift = Op.getOperand(0); + if (Shift.getOpcode() != AArch64ISD::VLSHR) + return Op; + + // Is the right shift using an immediate value of 1? 
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1); + if (ShiftAmount != 1) + return Op; + + SDValue Sub = Shift->getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return Op; + + SDValue Xor = Sub.getOperand(1); + if (Xor.getOpcode() != ISD::XOR) + return Op; + + SDValue ExtendOpA = Xor.getOperand(0); + SDValue ExtendOpB = Sub.getOperand(0); + unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); + unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); + if (!(ExtendOpAOpc == ExtendOpBOpc && + (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + return Op; + + // Is the result of the right shift being truncated to the same value type as + // the original operands, OpA and OpB? + SDValue OpA = ExtendOpA.getOperand(0); + SDValue OpB = ExtendOpB.getOperand(0); + EVT OpAVT = OpA.getValueType(); + assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); + if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + return Op; + + // Is the XOR using a constant amount of all ones in the right hand side? + uint64_t C; + if (!isAllConstantBuildVector(Xor.getOperand(1), C)) + return Op; + + unsigned ElemSizeInBits = VT.getScalarSizeInBits(); + APInt CAsAPInt(ElemSizeInBits, C); + if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) + return Op; + + SDLoc DL(Op); + bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; + unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD; + SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB); + + return ResultURHADD; +} + SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, llvm_unreachable("unexpected shift opcode"); case ISD::SHL: + if (VT.isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1); + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); @@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: + if (VT.isScalableVector()) { + unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_MERGE_OP1 + : AArch64ISD::SRL_MERGE_OP1; + return LowerToPredicatedOp(Op, DAG, Opc); + } + // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = @@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) { + if (Op.getOperand(0).getValueType().isFloatingPoint()) + return Op; + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); + } + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( @@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); @@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), @@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, dl); } +SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT != MVT::i64 && "Expected illegal VSCALE node"); + + SDLoc DL(Op); + APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); + return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), + DL, VT); +} + +/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. +template <unsigned NumVecs> +static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, + const CallInst &CI) { + Info.opc = ISD::INTRINSIC_VOID; + // Retrieve EC from first vector argument. + const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType()); + ElementCount EC = VT.getVectorElementCount(); +#ifndef NDEBUG + // Check the assumption that all input vectors are the same type. + for (unsigned I = 0; I < NumVecs; ++I) + assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) && + "Invalid type."); +#endif + // memVT is `NumVecs * VT`. + Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), + EC * NumVecs); + Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1); + Info.offset = 0; + Info.align.reset(); + Info.flags = MachineMemOperand::MOStore; + return true; +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
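(An aside for context, not part of the patch: the LowerTRUNCATE change earlier in this diff forms urhadd(OpA, OpB) from truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones)), 1)), relying on the fact that, once both operands are zero-extended to the wider element type, OpB - ~OpA wraps around to OpA + OpB + 1, so the shift-and-truncate yields the rounded average. The minimal standalone C++ check below, with made-up names, verifies that identity and the resulting rounding halving add for every pair of 8-bit operands; the same two's-complement argument applies to the sign-extended variant that forms srhadd.)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      // Zero-extend both operands to 16 bits, as the matched pattern does.
      uint16_t ExtA = static_cast<uint16_t>(A);
      uint16_t ExtB = static_cast<uint16_t>(B);
      // The DAG has already rewritten (ExtA + ExtB + 1) as (ExtB - ~ExtA).
      uint16_t Sub = static_cast<uint16_t>(ExtB - static_cast<uint16_t>(~ExtA));
      assert(Sub == static_cast<uint16_t>(ExtA + ExtB + 1));
      // Logical shift right by one and truncate back to 8 bits: this is the
      // per-lane result of URHADD.
      uint8_t Narrow = static_cast<uint8_t>(Sub >> 1);
      assert(Narrow == static_cast<uint8_t>((A + B + 1) >> 1));
    }
  }
  return 0;
}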
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { + case Intrinsic::aarch64_sve_st2: + return setInfoSVEStN<2>(Info, I); + case Intrinsic::aarch64_sve_st3: + return setInfoSVEStN<3>(Info, I); + case Intrinsic::aarch64_sve_st4: + return setInfoSVEStN<4>(Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_sve_ldnt1: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOLoad; + if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.flags = MachineMemOperand::MOStore; + if (Intrinsic == Intrinsic::aarch64_sve_stnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } default: @@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { /// or upper half of the vector elements. 
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { auto areTypesHalfed = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast<VectorType>(FullV->getType()); - auto *HalfVT = cast<VectorType>(HalfV->getType()); - return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); + auto *FullTy = FullV->getType(); + auto *HalfTy = HalfV->getType(); + return FullTy->getPrimitiveSizeInBits().getFixedSize() == + 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize(); }; auto extractHalf = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast<VectorType>(FullV->getType()); - auto *HalfVT = cast<VectorType>(HalfV->getType()); + auto *FullVT = cast<FixedVectorType>(FullV->getType()); + auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); }; - Constant *M1, *M2; + ArrayRef<int> M1, M2; Value *S1Op1, *S2Op1; - if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || - !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) + if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || + !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) return false; // Check that the operands are half as wide as the result and we extract @@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { // elements. int M1Start = -1; int M2Start = -1; - int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2; + int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) @@ -8948,6 +9682,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { return true; } +/// Check if Op could be used with vmull_high_p64 intrinsic. +static bool isOperandOfVmullHighP64(Value *Op) { + Value *VectorOperand = nullptr; + ConstantInt *ElementIndex = nullptr; + return match(Op, m_ExtractElt(m_Value(VectorOperand), + m_ConstantInt(ElementIndex))) && + ElementIndex->getValue() == 1 && + isa<FixedVectorType>(VectorOperand->getType()) && + cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; +} + +/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. +static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { + return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); +} + /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). @@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands( Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; + + case Intrinsic::aarch64_neon_pmull64: + if (!areOperandsOfVmullHighP64(II->getArgOperand(0), + II->getArgOperand(1))) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + Ops.push_back(&II->getArgOperandUse(1)); + return true; + default: return false; } @@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands( } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { + Align &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. 
- RequiredAligment = 0; + RequiredAligment = Align(1); unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } @@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } MachineMemOperand::Flags -AArch64TargetLowering::getMMOFlags(const Instruction &I) const { +AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; @@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType( unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the number of vector elements is greater than 1. - if (VecTy->getNumElements() < 2) + if (cast<FixedVectorType>(VecTy)->getNumElements() < 2) return false; // Ensure the element type is legal. @@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); - VectorType *VecTy = Shuffles[0]->getType(); + VectorType *VTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL)) return false; - unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + unsigned NumLoads = getNumInterleavedAccesses(VTy, DL); + + auto *FVTy = cast<FixedVectorType>(VTy); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. - Type *EltTy = VecTy->getVectorElementType(); + Type *EltTy = FVTy->getElementType(); if (EltTy->isPointerTy()) - VecTy = - VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + FVTy = + FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); IRBuilder<> Builder(LI); @@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / NumLoads); + FVTy = FixedVectorType::get(FVTy->getElementType(), + FVTy->getNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace())); + BaseAddr, + FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } - Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); - Type *Tys[2] = {VecTy, PtrTy}; + Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {FVTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; @@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. 
if (LoadCount > 0) - BaseAddr = - Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, - VecTy->getVectorNumElements() * Factor); + BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr, + FVTy->getNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); @@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, VectorType::get(SVI->getType()->getVectorElementType(), - VecTy->getVectorNumElements())); + SubVec, FixedVectorType::get(SVI->getType()->getElementType(), + FVTy->getNumElements())); SubVecs[SVI].push_back(SubVec); } } @@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - VectorType *VecTy = SVI->getType(); - assert(VecTy->getVectorNumElements() % Factor == 0 && - "Invalid interleaved store"); + auto *VecTy = cast<FixedVectorType>(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned LaneLen = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); - unsigned NumOpElts = Op0->getType()->getVectorNumElements(); + unsigned NumOpElts = + cast<FixedVectorType>(Op0->getType())->getNumElements(); // Convert to the corresponding integer vector. - Type *IntVecTy = VectorType::get(IntTy, NumOpElts); + auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, LaneLen); + SubVecTy = FixedVectorType::get(IntTy, LaneLen); } // The base address of the store. @@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. 
BaseAddr = Builder.CreateBitCast( - BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( - SI->getPointerAddressSpace())); + BaseAddr, + SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); @@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); } } // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); @@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); +// Lower an SVE structured load intrinsic returning a tuple type to target +// specific intrinsic taking the same input but returning a multi-result value +// of the split tuple type. +// +// E.g. Lowering an LD3: +// +// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32( +// <vscale x 4 x i1> %pred, +// <vscale x 4 x i32>* %addr) +// +// Output DAG: +// +// t0: ch = EntryToken +// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0 +// t4: i64,ch = CopyFromReg t0, Register:i64 %1 +// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4 +// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2 +// +// This is called pre-legalization to avoid widening/splitting issues with +// non-power-of-2 tuple types used for LD3, such as nxv12i32. 
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, + ArrayRef<SDValue> LoadOps, + EVT VT, SelectionDAG &DAG, + const SDLoc &DL) const { + assert(VT.isScalableVector() && "Can only lower scalable vectors"); + + unsigned N, Opcode; + static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = { + {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; + + std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; + assert(VT.getVectorElementCount().Min % N == 0 && + "invalid tuple vector type!"); + + EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorElementCount() / N); + assert(isTypeLegal(SplitVT)); + + SmallVector<EVT, 5> VTs(N, SplitVT); + VTs.push_back(MVT::Other); // Chain + SDVTList NodeTys = DAG.getVTList(VTs); + + SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps); + SmallVector<SDValue, 4> PseudoLoadOps; + for (unsigned I = 0; I < N; ++I) + PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps); } EVT AArch64TargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType( // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. 
- bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return MVT::v2i64; - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return MVT::f128; - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return MVT::i64; - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return MVT::i32; return MVT::Other; } LLT AArch64TargetLowering::getOptimalMemOpLLT( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; @@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. - bool IsSmallMemset = IsMemset && Size < 32; - auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { - if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + bool IsSmallMemset = Op.isMemset() && Op.size() < 32; + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Op.isAligned(AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, @@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT( Fast; }; - if (CanUseNEON && IsMemset && !IsSmallMemset && - AlignmentIsAcceptable(MVT::v2i64, 16)) + if (CanUseNEON && Op.isMemset() && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, Align(16))) return LLT::vector(2, 64); - if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return LLT::scalar(128); - if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return LLT::scalar(64); - if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return LLT::scalar(32); return LLT(); } @@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; + // FIXME: Update this method to support scalable addressing modes. 
+ if (isa<ScalableVectorType>(Ty)) + return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; + // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; @@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N, } if (FoundMatch) - return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } @@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); + unsigned Opc = Src->getOpcode(); + + // Zero/any extend of an unsigned unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + SDValue UnpkOp = Src->getOperand(0); + SDValue Dup = N->getOperand(1); + + if (Dup.getOpcode() != AArch64ISD::DUP) + return SDValue(); + + SDLoc DL(N); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); + uint64_t ExtVal = C->getZExtValue(); + + // If the mask is fully covered by the unpack, we don't need to push + // a new AND onto the operand + EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); + if ((ExtVal == 0xFF && EltTy == MVT::i8) || + (ExtVal == 0xFFFF && EltTy == MVT::i16) || + (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32)) + return Src; + + // Truncate to prevent a DUP with an over wide constant + APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); + + // Otherwise, make sure we propagate the AND to the operand + // of the unpack + Dup = DAG.getNode(AArch64ISD::DUP, DL, + UnpkOp->getValueType(0), + DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); + + SDValue And = DAG.getNode(ISD::AND, DL, + UnpkOp->getValueType(0), UnpkOp, Dup); + + return DAG.getNode(Opc, DL, N->getValueType(0), And); + } + SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) return SDValue(); - // GLD1* instructions perform an implicit zero-extend, which makes them + EVT MemVT; + + // SVE load instructions perform an implicit zero-extend, which makes them // perfect candidates for combining. 
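// (Illustration, not from the patch: the kind of redundancy this combine now catches for
// these loads. If an LD1_MERGE_ZERO loads i16 elements, zero-extending each lane into an
// nxv4i32 result, then an AND of that result with a splat of 0xffff changes nothing, since
// bits 16-31 of every lane are already zero; the isConstantSplatVectorMaskForType check
// below detects this and the AND is folded away in favour of the load itself.)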
- switch (Src->getOpcode()) { - case AArch64ISD::GLD1: - case AArch64ISD::GLD1_SCALED: - case AArch64ISD::GLD1_SXTW: - case AArch64ISD::GLD1_SXTW_SCALED: - case AArch64ISD::GLD1_UXTW: - case AArch64ISD::GLD1_UXTW_SCALED: - case AArch64ISD::GLD1_IMM: + switch (Opc) { + case AArch64ISD::LD1_MERGE_ZERO: + case AArch64ISD::LDNF1_MERGE_ZERO: + case AArch64ISD::LDFF1_MERGE_ZERO: + MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); + break; + case AArch64ISD::GLD1_MERGE_ZERO: + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + case AArch64ISD::GLDFF1_MERGE_ZERO: + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + case AArch64ISD::GLDNT1_MERGE_ZERO: + MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); break; default: return SDValue(); } - EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); - if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) return Src; @@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., @@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N, // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. - if (N->getNumOperands() == 2 && - N0->getOpcode() == ISD::TRUNCATE && - N1->getOpcode() == ISD::TRUNCATE) { + if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && + N1Opc == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); @@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); + // Optimise concat_vectors of two [us]rhadds that use extracted subvectors + // from the same original vectors. Combine these into a single [us]rhadd that + // operates on the two original vectors. 
Example: + // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), + // extract_subvector (v16i8 OpB, + // <0>))), + // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), + // extract_subvector (v16i8 OpB, + // <8>))))) + // -> + // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) + if (N->getNumOperands() == 2 && N0Opc == N1Opc && + (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) { + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + + EVT N00VT = N00.getValueType(); + EVT N10VT = N10.getValueType(); + + if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && + N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { + SDValue N00Source = N00->getOperand(0); + SDValue N01Source = N01->getOperand(0); + SDValue N10Source = N10->getOperand(0); + SDValue N11Source = N11->getOperand(0); + + if (N00Source == N10Source && N01Source == N11Source && + N00Source.getValueType() == VT && N01Source.getValueType() == VT) { + assert(N0.getValueType() == N1.getValueType()); + + uint64_t N00Index = N00.getConstantOperandVal(1); + uint64_t N01Index = N01.getConstantOperandVal(1); + uint64_t N10Index = N10.getConstantOperandVal(1); + uint64_t N11Index = N11.getConstantOperandVal(1); + + if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && + N10Index == N00VT.getVectorNumElements()) + return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); + } + } + } + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. @@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - if (N1->getOpcode() != ISD::BITCAST) + if (N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); @@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, return SDValue(); } +static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op1 = N->getOperand(1); + SDValue Op2 = N->getOperand(2); + EVT ScalarTy = Op1.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) { + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); + Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); + } + + return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0), + Op1, Op2); +} + +static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + SDValue Scalar = N->getOperand(3); + EVT ScalarTy = Scalar.getValueType(); + + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) + Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); + + SDValue Passthru = N->getOperand(1); + SDValue Pred = N->getOperand(2); + return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), + Pred, Scalar, Passthru); +} + static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); LLVMContext &Ctx = *DAG.getContext(); @@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, EXT); } -static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, - bool Invert, +static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, 
TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize()) @@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, } } + if (!Imm) + return SDValue(); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); - SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64); - SDValue Op0, Op1; - if (Invert) { - Op0 = Splat; - Op1 = N->getOperand(2); - } else { - Op0 = N->getOperand(2); - Op1 = Splat; - } - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - ID, Pred, Op0, Op1); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, + N->getOperand(2), Splat, DAG.getCondCode(CC)); } return SDValue(); @@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, return DAG.getZExtOrTrunc(Res, DL, VT); } +static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue VecToReduce = N->getOperand(2); + + EVT ReduceVT = VecToReduce.getValueType(); + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + +static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue InitVal = N->getOperand(2); + SDValue VecToReduce = N->getOperand(3); + EVT ReduceVT = VecToReduce.getValueType(); + + // Ordered reductions use the first lane of the result vector as the + // reduction's initial value. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, + DAG.getUNDEF(ReduceVT), InitVal, Zero); + + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. 
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N, return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG); case Intrinsic::aarch64_sve_andv: return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG); + case Intrinsic::aarch64_sve_index: + return LowerSVEIntrinsicIndex(N, DAG); + case Intrinsic::aarch64_sve_dup: + return LowerSVEIntrinsicDUP(N, DAG); + case Intrinsic::aarch64_sve_dup_x: + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), + N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_smin: + return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umin: + return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_smax: + return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_umax: + return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsl: + return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_lsr: + return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_asr: + return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sve_cmphs: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); + break; + case Intrinsic::aarch64_sve_cmphi: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); + break; + case Intrinsic::aarch64_sve_cmpge: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGE)); + break; + case Intrinsic::aarch64_sve_cmpgt: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGT)); + break; + case Intrinsic::aarch64_sve_cmpeq: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); + break; + case Intrinsic::aarch64_sve_cmpne: + if (!N->getOperand(2).getValueType().isFloatingPoint()) + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), 
N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + break; + case Intrinsic::aarch64_sve_fadda: + return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); + case Intrinsic::aarch64_sve_faddv: + return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxnmv: + return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxv: + return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_fminnmv: + return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fminv: + return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); + case Intrinsic::aarch64_sve_sel: + return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmpeq_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); case Intrinsic::aarch64_sve_cmpne_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); case Intrinsic::aarch64_sve_cmpge_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); case Intrinsic::aarch64_sve_cmpgt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplt_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); case Intrinsic::aarch64_sve_cmple_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, - true, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); case Intrinsic::aarch64_sve_cmphs_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); case Intrinsic::aarch64_sve_cmphi_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, - false, DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplo_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); case Intrinsic::aarch64_sve_cmpls_wide: - return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true, - DCI, DAG); + return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); case Intrinsic::aarch64_sve_ptest_any: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::ANY_ACTIVE); @@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N, if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) + // If the source VT is a 64-bit fixed or scalable vector, we can play games + // and get the better results we want. 
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + ElementCount SrcEC = SrcVT.getVectorElementCount(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); @@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N, // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); + LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext()); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); + LoVT.getVectorElementCount()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); + DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, return NewST1; } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. +static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + case MVT::nxv8i8: + case MVT::nxv8i16: + case MVT::nxv8f16: + case MVT::nxv8bf16: + return MVT::nxv8i16; + case MVT::nxv16i8: + return MVT::nxv16i8; + } +} + +static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + EVT ContainerVT = VT; + if (ContainerVT.isInteger()) + ContainerVT = getSVEContainerType(ContainerVT); + + SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); + SDValue Ops[] = { N->getOperand(0), // Chain + N->getOperand(2), // Pg + N->getOperand(3), // Base + DAG.getValueType(VT) }; + + SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (ContainerVT.isInteger() && (VT != ContainerVT)) + Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({ Load, LoadChain }, DL); +} + static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); + if (VT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); @@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { return L; } +template <unsigned Opcode> +static SDValue 
performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { + static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || + Opcode == AArch64ISD::LD1RO_MERGE_ZERO, + "Unsupported opcode."); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + EVT LoadVT = VT; + if (VT.isFloatingPoint()) + LoadVT = VT.changeTypeToInteger(); + + SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; + SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (VT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + +static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Data = N->getOperand(2); + EVT DataVT = Data.getValueType(); + EVT HwSrcVt = getSVEContainerType(DataVT); + SDValue InputVT = DAG.getValueType(DataVT); + + if (DataVT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + if (DataVT.isFloatingPoint()) + InputVT = DAG.getValueType(HwSrcVt); + + SDValue SrcNew; + if (Data.getValueType().isFloatingPoint()) + SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); + else + SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); + + SDValue Ops[] = { N->getOperand(0), // Chain + SrcNew, + N->getOperand(4), // Base + N->getOperand(3), // Pg + InputVT + }; + + return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); +} + static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); @@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); + if (DataVT == MVT::nxv8bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); @@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); + // Avoid scalarizing zero splat stores for scalable vectors. + if (VT.isScalableVector()) + return SDValue(); + // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); @@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); - if (!VT.isVector()) + + if (!VT.isFixedLengthVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of @@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT.isScalableVector()) + return SDValue(); + unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. @@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } -// Returns an SVE type that ContentTy can be trivially sign or zero extended -// into. -static MVT getSVEContainerType(EVT ContentTy) { - assert(ContentTy.isSimple() && "No SVE containers for extended types"); +// Turns the vector of indices into a vector of byte offstes by scaling Offset +// by (BitWidth / 8). 
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, + SDLoc DL, unsigned BitWidth) { + assert(Offset.getValueType().isScalableVector() && + "This method is only for scalable vectors of offsets"); - switch (ContentTy.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("No known SVE container for this MVT type"); - case MVT::nxv2i8: - case MVT::nxv2i16: - case MVT::nxv2i32: - case MVT::nxv2i64: - case MVT::nxv2f32: - case MVT::nxv2f64: - return MVT::nxv2i64; - case MVT::nxv4i8: - case MVT::nxv4i16: - case MVT::nxv4i32: - case MVT::nxv4f32: - return MVT::nxv4i32; - } + SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64); + SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift); + + return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift); } -static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { +/// Check if the value of \p OffsetInBytes can be used as an immediate for +/// the gather load/prefetch and scatter store instructions with vector base and +/// immediate offset addressing mode: +/// +/// [<Zn>.[S|D]{, #<imm>}] +/// +/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. + +inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, + unsigned ScalarSizeInBytes) { + // The immediate is not a multiple of the scalar size. + if (OffsetInBytes % ScalarSizeInBytes) + return false; + + // The immediate is out of range. + if (OffsetInBytes / ScalarSizeInBytes > 31) + return false; + + return true; +} + +/// Check if the value of \p Offset represents a valid immediate for the SVE +/// gather load/prefetch and scatter store instructiona with vector base and +/// immediate offset addressing mode: +/// +/// [<Zn>.[S|D]{, #<imm>}] +/// +/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. +static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, + unsigned ScalarSizeInBytes) { + ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode()); + return OffsetConst && isValidImmForSVEVecImmAddrMode( + OffsetConst->getZExtValue(), ScalarSizeInBytes); +} + +static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { const SDValue Src = N->getOperand(2); const EVT SrcVT = Src->getValueType(0); assert(SrcVT.isScalableVector() && @@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(4); + SDValue Base = N->getOperand(4); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal scatters because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { + Offset = + getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); + Opcode = AArch64ISD::SSTNT1_PRED; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // SST1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the stored items. For + // immediates outside that range and non-immediate scalar offsets use SST1 or + // SST1_UXTW instead. + if (Opcode == AArch64ISD::SST1_IMM_PRED) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + SrcVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = AArch64ISD::SST1_UXTW_PRED; + else + Opcode = AArch64ISD::SST1_PRED; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, // Source value type that is representable in hardware EVT HwSrcVt = getSVEContainerType(SrcVT); - // Keep the original type of the input data to store - this is needed to - // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the - // integer equivalent, so just use HwSrcVt. + // Keep the original type of the input data to store - this is needed to be + // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For + // FP values we want the integer equivalent, so just use HwSrcVt. SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); @@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, DL, VTs, Ops); } -static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, - unsigned Opcode, - bool OnlyPackedOffsets = true) { - EVT RetVT = N->getValueType(0); +static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + const EVT RetVT = N->getValueType(0); assert(RetVT.isScalableVector() && "Gather loads are only possible for SVE vectors"); + SDLoc DL(N); + // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) - const SDValue Base = N->getOperand(3); + SDValue Base = N->getOperand(3); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal gathers because there's no instruction that takes + // indicies. + if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { + Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, + RetVT.getScalarSizeInBits()); + Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; + } + + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && + Offset.getValueType().isVector()) + std::swap(Base, Offset); + + // GLD{FF}1_IMM requires that the offset is an immediate that is: + // * a multiple of #SizeInBytes, + // * in the range [0, 31 x #SizeInBytes], + // where #SizeInBytes is the size in bytes of the loaded items. For + // immediates outside that range and non-immediate scalar offsets use + // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. + if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || + Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { + if (!isValidImmForSVEVecImmAddrMode(Offset, + RetVT.getScalarSizeInBits() / 8)) { + if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_UXTW_MERGE_ZERO + : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; + else + Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) + ? AArch64ISD::GLD1_MERGE_ZERO + : AArch64ISD::GLDFF1_MERGE_ZERO; + + std::swap(Base, Offset); + } + } + auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); @@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); - // Keep the original output value type around - this will better inform - // optimisations (e.g. instruction folding when load is followed by - // zext/sext). This will only be used for ints, so the value for FPs - // doesn't matter. + // Keep the original output value type around - this is needed to be able to + // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP + // values we want the integer equivalent, so just use HwRetVT. SDValue OutVT = DAG.getValueType(RetVT); if (RetVT.isFloatingPoint()) OutVT = DAG.getValueType(HwRetVt); @@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, return DAG.getMergeValues({Load, LoadChain}, DL); } - static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc DL(N); SDValue Src = N->getOperand(0); unsigned Opc = Src->getOpcode(); - // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates + // Sign extend of an unsigned unpack -> signed unpack + if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { + + unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI + : AArch64ISD::SUNPKLO; + + // Push the sign extend to the operand of the unpack + // This is necessary where, for example, the operand of the unpack + // is another unpack: + // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) + // -> + // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) + // -> + // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) + SDValue ExtOp = Src->getOperand(0); + auto VT = cast<VTSDNode>(N->getOperand(1))->getVT(); + EVT EltTy = VT.getVectorElementType(); + (void)EltTy; + + assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && + "Sign extending from an invalid type"); + + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), + VT.getVectorElementType(), + VT.getVectorElementCount() * 2); + + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), + ExtOp, DAG.getValueType(ExtVT)); + + return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); + } + + // SVE load nodes (e.g. 
AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. unsigned NewOpc; + unsigned MemVTOpNum = 4; switch (Opc) { - case AArch64ISD::GLD1: - NewOpc = AArch64ISD::GLD1S; + case AArch64ISD::LD1_MERGE_ZERO: + NewOpc = AArch64ISD::LD1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDNF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::LDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; + MemVTOpNum = 3; + break; + case AArch64ISD::GLD1_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; break; - case AArch64ISD::GLD1_SCALED: - NewOpc = AArch64ISD::GLD1S_SCALED; + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW: - NewOpc = AArch64ISD::GLD1S_SXTW; + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_SXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_SXTW_SCALED; + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW: - NewOpc = AArch64ISD::GLD1S_UXTW; + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; break; - case AArch64ISD::GLD1_UXTW_SCALED: - NewOpc = AArch64ISD::GLD1S_UXTW_SCALED; + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; break; - case AArch64ISD::GLD1_IMM: - NewOpc = AArch64ISD::GLD1S_IMM; + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; + break; + case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: + NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; + break; + case AArch64ISD::GLDNT1_MERGE_ZERO: + NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; break; default: return SDValue(); } EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); - EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); + EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); - if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse()) + if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) return SDValue(); EVT DstVT = N->getValueType(0); SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); - SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2), - Src->getOperand(3), Src->getOperand(4)}; + + SmallVector<SDValue, 5> Ops; + for (unsigned I = 0; I < Src->getNumOperands(); ++I) + Ops.push_back(Src->getOperand(I)); SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); DCI.CombineTo(N, ExtLoad); @@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(N, 0); } +/// Legalize the gather prefetch (scalar + vector addressing mode) when the +/// offset vector is an unpacked 32-bit scalable vector. 
The other cases (Offset +/// != nxv2i32) do not need legalization. +static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { + const unsigned OffsetPos = 4; + SDValue Offset = N->getOperand(OffsetPos); + + // Not an unpacked vector, bail out. + if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) + return SDValue(); + + // Extend the unpacked offset vector to 64-bit lanes. + SDLoc DL(N); + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); + SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); + // Replace the offset operand with the 64-bit one. + Ops[OffsetPos] = Offset; + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + +/// Combines a node carrying the intrinsic +/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses +/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to +/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the +/// sve gather prefetch instruction with vector plus immediate addressing mode. +static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, + unsigned ScalarSizeInBytes) { + const unsigned ImmPos = 4, OffsetPos = 3; + // No need to combine the node if the immediate is valid... + if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) + return SDValue(); + + // ...otherwise swap the offset base with the offset... + SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); + std::swap(Ops[ImmPos], Ops[OffsetPos]); + // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to + // `aarch64_sve_prfb_gather_uxtw_index`. + SDLoc DL(N); + Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, + MVT::i64); + + return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: + return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); + case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ld1rq: + return 
performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); + case Intrinsic::aarch64_sve_ld1ro: + return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ld1: + return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldnf1: + return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1: + return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_st1: + return performST1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); case Intrinsic::aarch64_sve_ld1_gather: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SCALED_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_sxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + return performGatherLoadCombine(N, DAG, + 
AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_ld1_gather_imm: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, + AArch64ISD::GLDFF1_IMM_MERGE_ZERO); case Intrinsic::aarch64_sve_st1_scatter: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); case Intrinsic::aarch64_sve_st1_scatter_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, - /*OnlyPackedOffsets=*/false); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_SXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, - /*OnlyPackedOffsets=*/false); - case Intrinsic::aarch64_sve_st1_scatter_imm: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); + return performScatterStoreCombine(N, DAG, + AArch64ISD::SST1_UXTW_SCALED_PRED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); + case Intrinsic::aarch64_sve_tuple_get: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Src1 = N->getOperand(2); + SDValue Idx = N->getOperand(3); + + uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); + EVT ResVT = N->getValueType(0); + uint64_t NumLanes = ResVT.getVectorElementCount().Min; + SDValue Val = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, + DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32)); + return DAG.getMergeValues({Val, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_set: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Tuple = N->getOperand(2); + SDValue Idx = N->getOperand(3); + SDValue Vec = N->getOperand(4); + + EVT TupleVT = Tuple.getValueType(); + uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + + uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); + uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + + if ((TupleLanes % NumLanes) != 0) + report_fatal_error("invalid tuple vector!"); + + uint64_t NumVecs = TupleLanes / NumLanes; + + SmallVector<SDValue, 4> Opnds; + for (unsigned I = 0; I < NumVecs; ++I) { + if (I == IdxConst) + Opnds.push_back(Vec); + else { + Opnds.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 
Vec.getValueType(), Tuple, + DAG.getConstant(I * NumLanes, DL, MVT::i32))); + } + } + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_tuple_create2: + case Intrinsic::aarch64_sve_tuple_create3: + case Intrinsic::aarch64_sve_tuple_create4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + + SmallVector<SDValue, 4> Opnds; + for (unsigned I = 2; I < N->getNumOperands(); ++I) + Opnds.push_back(N->getOperand(I)); + + EVT VT = Opnds[0].getValueType(); + EVT EltVT = VT.getVectorElementType(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + VT.getVectorElementCount() * + (N->getNumOperands() - 2)); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds); + return DAG.getMergeValues({Concat, Chain}, DL); + } + case Intrinsic::aarch64_sve_ld2: + case Intrinsic::aarch64_sve_ld3: + case Intrinsic::aarch64_sve_ld4: { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Mask = N->getOperand(2); + SDValue BasePtr = N->getOperand(3); + SDValue LoadOps[] = {Chain, Mask, BasePtr}; + unsigned IntrinsicID = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + SDValue Result = + LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL); + return DAG.getMergeValues({Result, Chain}, DL); + } default: break; } @@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SDLoc DL(N); SDValue Op = N->getOperand(0); - if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + if (N->getValueType(0) != MVT::i16 || + (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16)) return; Op = SDValue( @@ -12759,6 +14286,40 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { return std::make_pair(Lo, Hi); } +void AArch64TargetLowering::ReplaceExtractSubVectorResults( + SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + + // Common code will handle these just fine. + if (!InVT.isScalableVector() || !InVT.isInteger()) + return; + + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // The following checks bail if this is not a halving operation. + + ElementCount ResEC = VT.getVectorElementCount(); + + if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) + return; + + auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!CIndex) + return; + + unsigned Index = CIndex->getZExtValue(); + if ((Index != 0) && (Index != ResEC.Min)) + return; + + unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; + EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); + + SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); +} + // Create an even/odd pair of X registers holding integer value V. 
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); @@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); - Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, - SDValue(CmpSwap, 0))); + SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, + SDValue(CmpSwap, 0)); + Results.push_back( + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } @@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); - Results.push_back(SDValue(CmpSwap, 0)); - Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, + SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); Results.push_back(SDValue(CmpSwap, 3)); } @@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::CTPOP: + Results.push_back(LowerCTPOP(SDValue(N, 0), DAG)); + return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; @@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.append({Pair, Result.getValue(2) /* Chain */}); return; } + case ISD::EXTRACT_SUBVECTOR: + ReplaceExtractSubVectorResults(N, Results, DAG); + return; case ISD::INTRINSIC_WO_CHAIN: { EVT VT = N->getValueType(0); assert((VT == MVT::i8 || VT == MVT::i16) && @@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - if (getTargetMachine().getOptLevel() == 0) + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::None; return AtomicExpansionKind::LLSC; } @@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = - Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); return OptSize && !VT.isVector(); } @@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } + +bool AArch64TargetLowering::shouldLocalize( + const MachineInstr &MI, const TargetTransformInfo *TTI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_GLOBAL_VALUE: { + // On Darwin, TLS global vars get selected into function calls, which + // we don't want localized, as they can get moved into the middle of a + // another call sequence. 
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal(); + if (GV.isThreadLocal() && Subtarget->isTargetMachO()) + return false; + break; + } + // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being + // localizable. + case AArch64::ADRP: + case AArch64::G_ADD_LOW: + return true; + default: + break; + } + return TargetLoweringBase::shouldLocalize(MI, TTI); +} + +bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { + if (isa<ScalableVectorType>(Inst.getType())) + return true; + + for (unsigned i = 0; i < Inst.getNumOperands(); ++i) + if (isa<ScalableVectorType>(Inst.getOperand(i)->getType())) + return true; + + return false; +} + +// Return the largest legal scalable vector type that matches VT's element type. +static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE container"); + case MVT::i8: + return EVT(MVT::nxv16i8); + case MVT::i16: + return EVT(MVT::nxv8i16); + case MVT::i32: + return EVT(MVT::nxv4i32); + case MVT::i64: + return EVT(MVT::nxv2i64); + case MVT::f16: + return EVT(MVT::nxv8f16); + case MVT::f32: + return EVT(MVT::nxv4f32); + case MVT::f64: + return EVT(MVT::nxv2f64); + } +} + +// Return a PTRUE with active lanes corresponding to the extent of VT. +static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isFixedLengthVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal fixed length vector!"); + + int PgPattern; + switch (VT.getVectorNumElements()) { + default: + llvm_unreachable("unexpected element count for SVE predicate"); + case 1: + PgPattern = AArch64SVEPredPattern::vl1; + break; + case 2: + PgPattern = AArch64SVEPredPattern::vl2; + break; + case 4: + PgPattern = AArch64SVEPredPattern::vl4; + break; + case 8: + PgPattern = AArch64SVEPredPattern::vl8; + break; + case 16: + PgPattern = AArch64SVEPredPattern::vl16; + break; + case 32: + PgPattern = AArch64SVEPredPattern::vl32; + break; + case 64: + PgPattern = AArch64SVEPredPattern::vl64; + break; + case 128: + PgPattern = AArch64SVEPredPattern::vl128; + break; + case 256: + PgPattern = AArch64SVEPredPattern::vl256; + break; + } + + // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can + // use AArch64SVEPredPattern::all, which can enable the use of unpredicated + // variants of instructions when available. 
+ + MVT MaskVT; + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for SVE predicate"); + case MVT::i8: + MaskVT = MVT::nxv16i1; + break; + case MVT::i16: + case MVT::f16: + MaskVT = MVT::nxv8i1; + break; + case MVT::i32: + case MVT::f32: + MaskVT = MVT::nxv4i1; + break; + case MVT::i64: + case MVT::f64: + MaskVT = MVT::nxv2i1; + break; + } + + return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT, + DAG.getTargetConstant(PgPattern, DL, MVT::i64)); +} + +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT) { + assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + "Expected legal scalable vector!"); + auto PredTy = VT.changeVectorElementType(MVT::i1); + return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); +} + +static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { + if (VT.isFixedLengthVector()) + return getPredicateForFixedLengthVector(DAG, DL, VT); + + return getPredicateForScalableVector(DAG, DL, VT); +} + +// Grow V to consume an entire SVE register. +static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isScalableVector() && + "Expected to convert into a scalable vector!"); + assert(V.getValueType().isFixedLengthVector() && + "Expected a fixed length vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); +} + +// Shrink V so it's just big enough to maintain a VT's worth of data. +static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { + assert(VT.isFixedLengthVector() && + "Expected to convert into a fixed length vector!"); + assert(V.getValueType().isScalableVector() && + "Expected a scalable vector operand!"); + SDLoc DL(V); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero); +} + +// Convert all fixed length vector loads larger than NEON to masked_loads. +SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Load = cast<LoadSDNode>(Op); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewLoad = DAG.getMaskedLoad( + ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT), + Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), + Load->getExtensionType()); + + auto Result = convertFromScalableVector(DAG, VT, NewLoad); + SDValue MergedValues[2] = {Result, Load->getChain()}; + return DAG.getMergeValues(MergedValues, DL); +} + +// Convert all fixed length vector stores larger than NEON to masked_stores. 
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( + SDValue Op, SelectionDAG &DAG) const { + auto Store = cast<StoreSDNode>(Op); + + SDLoc DL(Op); + EVT VT = Store->getValue().getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); + return DAG.getMaskedStore( + Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), + getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore()); +} + +SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + SDValue Val = Op.getOperand(0); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); + Val = convertToScalableVector(DAG, ContainerVT, Val); + + // Repeatedly truncate Val until the result is of the desired element type. + switch (ContainerVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unimplemented container type"); + case MVT::nxv2i64: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); + if (VT.getVectorElementType() == MVT::i32) + break; + LLVM_FALLTHROUGH; + case MVT::nxv4i32: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); + if (VT.getVectorElementType() == MVT::i16) + break; + LLVM_FALLTHROUGH; + case MVT::nxv8i16: + Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); + Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); + assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); + break; + } + + return convertFromScalableVector(DAG, VT, Val); +} + +SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, + SelectionDAG &DAG, + unsigned NewOp) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + auto Pg = getPredicateForVector(DAG, DL, VT); + + if (useSVEForFixedLengthVectorVT(VT)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + // Create list of operands by convereting existing ones to scalable types. + SmallVector<SDValue, 4> Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + if (isa<CondCodeSDNode>(V)) { + Operands.push_back(V); + continue; + } + + assert(useSVEForFixedLengthVectorVT(V.getValueType()) && + "Only fixed length vectors are supported!"); + Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); + } + + auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); + return convertFromScalableVector(DAG, VT, ScalableRes); + } + + assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); + + SmallVector<SDValue, 4> Operands = {Pg}; + for (const SDValue &V : Op->op_values()) { + assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) && + "Only scalable vectors are supported!"); + Operands.push_back(V); + } + + return DAG.getNode(NewOp, DL, VT, Operands); +} |
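The scatter/gather combines in this patch document two small addressing-mode rules in their comments: "scalar + vector of indices" forms are rewritten by scaling each lane index into a byte offset (index shifted left by log2 of the element size in bytes, as getScaledOffsetForBitWidth does), and the vector-plus-immediate forms only accept byte offsets of the form sizeof(T) * k with k in [0, 31] (isValidImmForSVEVecImmAddrMode). A minimal standalone C++ sketch of that arithmetic follows; the helper names and example values are illustrative only and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Lane index -> byte offset, mirroring the per-lane effect of the patch's
// getScaledOffsetForBitWidth: shift by log2(element size in bytes).
static uint64_t scaleIndexToByteOffset(uint64_t Index, unsigned ElementBitWidth) {
  unsigned ShiftAmount = 0;                      // portable stand-in for llvm::Log2_32
  for (unsigned Bytes = ElementBitWidth / 8; Bytes > 1; Bytes >>= 1)
    ++ShiftAmount;
  return Index << ShiftAmount;
}

// Vector-plus-immediate rule described for isValidImmForSVEVecImmAddrMode:
// the byte offset must be ScalarSizeInBytes * k for some k in [0, 31].
static bool isValidVecImmOffset(unsigned OffsetInBytes, unsigned ScalarSizeInBytes) {
  if (OffsetInBytes % ScalarSizeInBytes != 0)    // not a multiple of the element size
    return false;
  return OffsetInBytes / ScalarSizeInBytes <= 31; // scaled value must fit in [0, 31]
}

int main() {
  // Hypothetical example: 32-bit elements, lane index 5 -> byte offset 20.
  std::printf("%llu\n", (unsigned long long)scaleIndexToByteOffset(5, 32));
  // 64-bit elements: 248 = 8 * 31 is the largest valid immediate; 256 is not valid.
  std::printf("%d %d\n", isValidVecImmOffset(248, 8), isValidVecImmOffset(256, 8));
  return 0;
}

When the immediate check fails, the combines above fall back to the scalar-plus-vector opcodes (e.g. SST1_PRED / GLD1_MERGE_ZERO or their UXTW variants) and swap Base with Offset, which is why both functions treat the immediate form as an optimisation rather than a requirement.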