diff options
Diffstat (limited to 'llvm/lib/Target/ARM/ARMInstrMVE.td')
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrMVE.td | 2090 |
1 files changed, 1462 insertions, 628 deletions
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 604291be822c4..2a1f50d97e3b3 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -10,44 +10,6 @@ // //===----------------------------------------------------------------------===// -class ExpandImmAsmOp<string shift> : AsmOperandClass { - let Name = !strconcat("ExpandImm", shift); - let PredicateMethod = !strconcat("isExpImm<", shift, ">"); - let RenderMethod = "addImmOperands"; -} -class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass { - let Name = !strconcat("InvertedExpandImm", shift, "_", size); - let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">"); - let RenderMethod = "addImmOperands"; -} - -class ExpandImm<string shift> : Operand<i32> { - let ParserMatchClass = ExpandImmAsmOp<shift>; - let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>"); - let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">"); - let PrintMethod = "printExpandedImmOperand"; -} -class InvertedExpandImm<string shift, string size> : Operand<i32> { - let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>; - let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>"); - let PrintMethod = "printExpandedImmOperand"; - // No decoder method needed, because this operand type is only used - // by aliases (VAND and VORN) -} - -def expzero00 : ExpandImm<"0">; -def expzero08 : ExpandImm<"8">; -def expzero16 : ExpandImm<"16">; -def expzero24 : ExpandImm<"24">; - -def expzero00inv16 : InvertedExpandImm<"0", "16">; -def expzero08inv16 : InvertedExpandImm<"8", "16">; - -def expzero00inv32 : InvertedExpandImm<"0", "32">; -def expzero08inv32 : InvertedExpandImm<"8", "32">; -def expzero16inv32 : InvertedExpandImm<"16", "32">; -def expzero24inv32 : InvertedExpandImm<"24", "32">; - // VPT condition mask def vpt_mask : Operand<i32> { let PrintMethod = "printVPTMask"; @@ -277,7 +239,8 @@ 
class mve_addr_q_shift<int shift> : MemOperand { // A family of classes wrapping up information about the vector types // used by MVE. -class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, +class MVEVectorVTInfo<ValueType vec, ValueType dblvec, + ValueType pred, ValueType dblpred, bits<2> size, string suffixletter, bit unsigned> { // The LLVM ValueType representing the vector, so we can use it in // ISel patterns. @@ -300,6 +263,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, // directly. ValueType Pred = pred; + // Same as Pred but for DblVec rather than Vec. + ValueType DblPred = dblpred; + // The most common representation of the vector element size in MVE // instruction encodings: a 2-bit value V representing an (8<<V)-bit // vector element. @@ -319,38 +285,38 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, !cast<string>(LaneBits)); // The suffix used on an instruction that mentions the whole type. - string Suffix = suffixletter ## BitsSuffix; + string Suffix = suffixletter # BitsSuffix; // The letter part of the suffix only. string SuffixLetter = suffixletter; } // Integer vector types that don't treat signed and unsigned differently. -def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>; -def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>; -def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>; -def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>; +def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>; +def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>; +def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>; +def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. 
-def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>; -def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>; -def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>; -def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>; -def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>; -def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>; -def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>; -def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>; +def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>; +def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>; +def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>; +def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>; // FP vector types. -def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>; -def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>; -def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>; +def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>; // Polynomial vector types. 
-def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>; -def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>; +def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>; +def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>; // --------- Start of base classes for the instructions themselves @@ -473,6 +439,8 @@ class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm, let Inst{19-17} = RdaLo{3-1}; let Inst{11-9} = RdaHi{3-1}; + + let hasSideEffects = 0; } class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16, @@ -590,6 +558,7 @@ class MVE_VABAV<string suffix, bit U, bits<2> size> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let horizontalReduction = 1; } multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> { @@ -639,38 +608,63 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; + let validForTailPredication = 1; } -multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size, - list<dag> pattern=[]> { - def acc : MVE_VADDV<"vaddva", suffix, +def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; +def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; + +multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> { + def acc : MVE_VADDV<"vaddva", VTI.Suffix, (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src", - 0b1, U, size, pattern>; - def no_acc : MVE_VADDV<"vaddv", suffix, + 0b1, VTI.Unsigned, VTI.Size>; + def no_acc : MVE_VADDV<"vaddv", VTI.Suffix, (ins MQPR:$Qm), "", - 0b0, U, size, pattern>; -} + 0b0, VTI.Unsigned, VTI.Size>; -defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>; -defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>; -defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>; -defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; -defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; -defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; + defvar InstA = 
!cast<Instruction>(NAME # "acc"); + defvar InstN = !cast<Instruction>(NAME # "no_acc"); -let Predicates = [HasMVEInt] in { - def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu32acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu16acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu8acc $src2, $src1))>; + let Predicates = [HasMVEInt] in { + if VTI.Unsigned then { + def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } else { + def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } + def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred)), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + } } +defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>; +defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>; +defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>; +defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>; +defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>; +defm 
MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>; + class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, bit A, bit U, list<dag> pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -689,21 +683,58 @@ class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; -} - -multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> { - def acc : MVE_VADDLV<"vaddlva", suffix, + let horizontalReduction = 1; +} + +def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> +]>; +def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4> +]>; +def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2> +]>; +def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5> +]>; + +multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> { + def acc : MVE_VADDLV<"vaddlva", VTI.Suffix, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", - 0b1, U, pattern>; - def no_acc : MVE_VADDLV<"vaddlv", suffix, + 0b1, VTI.Unsigned>; + def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix, (ins MQPR:$Qm), "", - 0b0, U, pattern>; -} + 0b0, VTI.Unsigned>; + + defvar InstA = !cast<Instruction>(NAME # "acc"); + defvar InstN = !cast<Instruction>(NAME # "no_acc"); + defvar letter = VTI.SuffixLetter; + defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>; + defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>; + defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>; + defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>; -defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>; -defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>; + let Predicates = [HasMVEInt] in { + 
def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)), + (InstN (v4i32 MQPR:$vec))>; + def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>; + def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)), + (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>; + def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + (VTI.Pred VCCR:$pred)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred))>; + } +} + +defm MVE_VADDLVs32 : MVE_VADDLV_A<MVE_v4s32>; +defm MVE_VADDLVu32 : MVE_VADDLV_A<MVE_v4u32>; class MVE_VMINMAXNMV<string iname, string suffix, bit sz, bit bit_17, bit bit_7, list<dag> pattern=[]> @@ -724,25 +755,48 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz, let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; let Predicates = [HasMVEFloat]; + let hasSideEffects = 0; } -multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> { - def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>; - def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>; -} +multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin, + MVEVectorVTInfo VTI, string intrBaseName, + ValueType Scalar, RegisterClass ScalarReg> { + def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_intr = !cast<Intrinsic>(intrBaseName); + defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated"); -defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>; -defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>; + let Predicates = [HasMVEFloat] in { + def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev), + (VTI.Vec MQPR:$vec))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec)), + ScalarReg)>; + def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev), 
+ (VTI.Vec MQPR:$vec), + (VTI.Pred VCCR:$pred))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)), + ScalarReg)>; + } +} -multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> { - def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>; - def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>; +multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin, + string intrBase> { + defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase, + f32, SPR>; + defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase, + f16, HPR>; } -defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>; -defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>; +defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">; +defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">; +defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">; +defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">; class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size, bit bit_17, bit bit_7, list<dag> pattern=[]> @@ -762,33 +816,40 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size, let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; } -multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7, - MVEVectorVTInfo VTI, Intrinsic intr> { +multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin, + MVEVectorVTInfo VTI, string intrBaseName> { def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, - bit_17, bit_7>; - defvar Inst = !cast<Instruction>(NAME); + notAbs, isMin>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_intr = !cast<Intrinsic>(intrBaseName); + defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated"); + defvar base_args = (? 
(i32 rGPR:$prev), (VTI.Vec MQPR:$vec)); + defvar args = !if(notAbs, !con(base_args, (? (i32 VTI.Unsigned))), + base_args); - let Predicates = [HasMVEInt] in - def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))), - (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + let Predicates = [HasMVEInt] in { + def : Pat<(i32 !con(args, (unpred_intr))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; + } } -multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, - Intrinsic intr_s, Intrinsic intr_u> { - defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>; - defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>; - defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>; - defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>; - defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>; - defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>; +multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> { + defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>; + defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>; + defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>; + defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>; + defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>; + defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>; } -defm MVE_VMINV : MVE_VMINMAXV_ty< - "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>; -defm MVE_VMAXV : MVE_VMINMAXV_ty< - "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>; +defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">; +defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), @@ -819,14 +880,14 @@ let Predicates = 
[HasMVEInt] in { } -multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> { - def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>; - def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>; - def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>; +multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> { + defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>; + defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>; + defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>; } -defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; -defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; +defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">; +defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">; class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0> @@ -847,6 +908,12 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; + let horizontalReduction = 1; + // Allow tail predication for non-exchanging versions. As this is also a + // horizontalReduction, ARMLowOverheadLoops will also have to check that + // the vector operands contain zeros in their false lanes for the instruction + // to be properly valid. 
+ let validForTailPredication = !eq(X, 0); } multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI, @@ -932,6 +999,58 @@ defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>; +def SDTVecReduce2 : SDTypeProfile<1, 2, [ // VMLAV + SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2> +]>; +def SDTVecReduce2L : SDTypeProfile<2, 2, [ // VMLALV + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3> +]>; +def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5> +]>; +def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>; +def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>; +def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>; +def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>; +def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>; +def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>; + +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), + (i32 (MVE_VMLADAVu32 $src1, $src2))>; + def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))), + (i32 (MVE_VMLADAVu16 $src1, $src2))>; + def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))), + (i32 (MVE_VMLADAVu8 $src1, $src2))>; + def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + + def : 
Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>; + def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>; + def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>; + def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; +} + // vmlav aliases vmladav foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { @@ -963,6 +1082,14 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; + let horizontalReduction = 1; + // Allow tail predication for non-exchanging versions. As this is also a + // horizontalReduction, ARMLowOverheadLoops will also have to check that + // the vector operands contain zeros in their false lanes for the instruction + // to be properly valid. 
+ let validForTailPredication = !eq(X, 0); + + let hasSideEffects = 0; } multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix, @@ -1023,6 +1150,26 @@ multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> { defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + def : Pat<(ARMVMLALVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + + def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; +} + // vmlalv aliases vmlaldav foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { @@ -1244,28 +1391,29 @@ let Predicates = [HasMVEInt] in { (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; } -let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), - (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; - def : Pat<(v8i16 
(ARMvrev64 (v8i16 MQPR:$src))), - (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>; - def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>; +multiclass MVE_VREV_basic_patterns<int revbits, list<MVEVectorVTInfo> VTIs, + Instruction Inst> { + defvar unpred_op = !cast<SDNode>("ARMvrev" # revbits); - def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))), - (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>; - def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>; + foreach VTI = VTIs in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src)))>; + def : Pat<(VTI.Vec (int_arm_mve_vrev_predicated (VTI.Vec MQPR:$src), + revbits, (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } +} + +let Predicates = [HasMVEInt] in { + defm: MVE_VREV_basic_patterns<64, [MVE_v4i32, MVE_v4f32], MVE_VREV64_32>; + defm: MVE_VREV_basic_patterns<64, [MVE_v8i16, MVE_v8f16], MVE_VREV64_16>; + defm: MVE_VREV_basic_patterns<64, [MVE_v16i8 ], MVE_VREV64_8>; - def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>; + defm: MVE_VREV_basic_patterns<32, [MVE_v8i16, MVE_v8f16], MVE_VREV32_16>; + defm: MVE_VREV_basic_patterns<32, [MVE_v16i8 ], MVE_VREV32_8>; - def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))), - (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>; - def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))), - (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>; - def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))), - (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>; + defm: MVE_VREV_basic_patterns<16, [MVE_v16i8 ], MVE_VREV16_8>; } def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), @@ -1280,14 +1428,14 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), } let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))), - (v16i8 (MVE_VMVN (v16i8 
MQPR:$val1)))>; - def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))), - (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>; - def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))), - (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>; - def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))), - (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>; + foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in { + def : Pat<(VTI.Vec (vnotq (VTI.Vec MQPR:$val1))), + (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1)))>; + def : Pat<(VTI.Vec (int_arm_mve_mvn_predicated (VTI.Vec MQPR:$val1), + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } } class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28> @@ -1383,10 +1531,10 @@ defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>; defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>; defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>; -class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> +class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { - bits<8> imm; + bits<12> imm; bits<4> Qd; let Inst{28} = imm{7}; @@ -1396,66 +1544,59 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> let Inst{18-16} = imm{6-4}; let Inst{15-13} = Qd{2-0}; let Inst{12} = 0b0; - let Inst{11-8} = cmode; + let Inst{11} = halfword; + let Inst{10} = !if(halfword, 0, imm{10}); + let Inst{9} = imm{9}; + let Inst{8} = 0b1; let Inst{7-6} = 0b01; let Inst{4} = 0b1; let Inst{3-0} = imm{3-0}; } -class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type> - : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { - let Inst{5} = 0b0; - let validForTailPredication = 1; -} +multiclass MVE_bit_cmode_p<string iname, bit opcode, + MVEVectorVTInfo 
VTI, Operand imm_type, SDNode op> { + def "" : MVE_bit_cmode<iname, VTI.Suffix, VTI.Size{0}, + (ins MQPR:$Qd_src, imm_type:$imm)> { + let Inst{5} = opcode; + let validForTailPredication = 1; + } -def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; -def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>; -def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>; -def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>; -def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>; -def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>; - -def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; + defvar Inst = !cast<Instruction>(NAME); + defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm)); -def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", - (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>; + let Predicates = [HasMVEInt] in { + def : Pat<UnpredPat, (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + UnpredPat, (VTI.Vec MQPR:$src))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm, + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; + } +} -class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type> - : 
MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { - let Inst{5} = 0b1; - let validForTailPredication = 1; +multiclass MVE_VORRimm<MVEVectorVTInfo VTI, Operand imm_type> { + defm "": MVE_bit_cmode_p<"vorr", 0, VTI, imm_type, ARMvorrImm>; +} +multiclass MVE_VBICimm<MVEVectorVTInfo VTI, Operand imm_type> { + defm "": MVE_bit_cmode_p<"vbic", 1, VTI, imm_type, ARMvbicImm>; } -def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; -def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>; -def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>; -def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>; -def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>; -def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>; - -def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +defm MVE_VORRimmi16 : MVE_VORRimm<MVE_v8i16, nImmSplatI16>; +defm MVE_VORRimmi32 : MVE_VORRimm<MVE_v4i32, nImmSplatI32>; +defm MVE_VBICimmi16 : MVE_VBICimm<MVE_v8i16, nImmSplatI16>; +defm MVE_VBICimmi32 : MVE_VBICimm<MVE_v4i32, nImmSplatI32>; + +def MVE_VORNimmi16 : MVEInstAlias<"vorn${vp}.i16\t$Qd, $imm", + (MVE_VORRimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>; +def 
MVE_VORNimmi32 : MVEInstAlias<"vorn${vp}.i32\t$Qd, $imm", + (MVE_VORRimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>; + +def MVE_VANDimmi16 : MVEInstAlias<"vand${vp}.i16\t$Qd, $imm", + (MVE_VBICimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>; +def MVE_VANDimmi32 : MVEInstAlias<"vand${vp}.i32\t$Qd, $imm", + (MVE_VBICimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>; + +def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", + (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>; class MVE_VMOV_lane_direction { bit bit_20; @@ -1494,6 +1635,8 @@ class MVE_VMOV_lane<string suffix, bit U, dag indexop, let Inst{11-8} = 0b1011; let Inst{7} = Qd{3}; let Inst{4-0} = 0b10000; + + let hasSideEffects = 0; } class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir> @@ -1557,10 +1700,14 @@ let Predicates = [HasMVEInt] in { (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; def : Pat<(v16i8 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; @@ -1575,8 +1722,8 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane), (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>; - def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), - (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; + def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane), + (MVE_VMOV_to_lane_16 MQPR:$src1, 
(COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), @@ -1588,8 +1735,8 @@ let Predicates = [HasMVEInt] in { (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; def : Pat<(v4f32 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; - def : Pat<(v8f16 (scalar_to_vector HPR:$src)), - (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>; + def : Pat<(v8f16 (scalar_to_vector (f16 HPR:$src))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>; def : Pat<(v8f16 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; } @@ -1882,6 +2029,26 @@ class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let validForTailPredication = 1; } +def addnuw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; + +def addnsw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + +def subnuw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; + +def subnsw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int> { def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>; @@ -1913,6 +2080,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>; defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>; defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>; +// Rounding Halving Add perform the arithmetic operation with an extra bit of +// precision, before performing the shift, to avoid clipping errors. 
We're not +// modelling that here with these patterns, but we're using no wrap forms of +// add to ensure that the extra bit of information is not needed for the +// arithmetic or the rounding. +def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; + + class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, bits<2> size, list<dag> pattern=[]> : MVE_int<iname, suffix, size, pattern> { @@ -1936,7 +2134,8 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size, : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, - SDNode unpred_op, Intrinsic pred_int> { + SDNode unpred_op, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); @@ -1945,6 +2144,9 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec 
(shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + // Predicated add-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), @@ -1954,18 +2156,24 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, } } -multiclass MVE_VHADD<MVEVectorVTInfo VTI> - : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>; +multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> + : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op, + shift_op>; -defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>; -defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>; -defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>; -defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>; -defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>; -defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>; +// Halving add/sub perform the arithmetic operation with an extra bit of +// precision, before performing the shift, to avoid clipping errors. We're not +// modelling that here with these patterns, but we're using no wrap forms of +// add/sub to ensure that the extra bit of information is not needed. 
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>; +defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>; +defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>; +defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>; +defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>; +defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>; multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, - SDNode unpred_op, Intrinsic pred_int> { + SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op, + SDNode shift_op> { def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); @@ -1975,6 +2183,10 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + + // Predicated subtract-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), @@ -1985,15 +2197,16 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, } } -multiclass MVE_VHSUB<MVEVectorVTInfo VTI> - : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>; +multiclass MVE_VHSUB<MVEVectorVTInfo VTI, PatFrag sub_op, SDNode shift_op> + : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, sub_op, + shift_op>; -defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>; -defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>; -defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>; -defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>; -defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>; -defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>; +defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8, subnsw, ARMvshrsImm>; +defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16, subnsw, ARMvshrsImm>; +defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32, subnsw, ARMvshrsImm>; +defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>; +defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>; +defm 
MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>; class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, @@ -2028,24 +2241,37 @@ let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - // For the 16-bit and 8-bit vduplanes we don't care about the signedness - // of the lane move operation as we only want the lowest 8/16 bits anyway. - def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; - def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)), - (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>; - - def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))), - (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>; - def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))), - (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>; + def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP16 rGPR:$elem)>; + def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; + // Match a vselect with an ARMvdup as a predicated MVE_VDUP + def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), + (v16i8 (ARMvdup (i32 rGPR:$elem))), + (v16i8 MQPR:$inactive))), + (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred), + (v16i8 MQPR:$inactive))>; + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), + (v8i16 (ARMvdup (i32 rGPR:$elem))), + (v8i16 MQPR:$inactive))), + (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), + (v8i16 MQPR:$inactive))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), + (v4i32 
(ARMvdup (i32 rGPR:$elem))), + (v4i32 MQPR:$inactive))), + (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), + (v4i32 MQPR:$inactive))>; + def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), + (v4f32 (ARMvdup (i32 rGPR:$elem))), + (v4f32 MQPR:$inactive))), + (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), + (v4f32 MQPR:$inactive))>; + def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), + (v8f16 (ARMvdup (i32 rGPR:$elem))), + (v8f16 MQPR:$inactive))), + (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), + (v8f16 MQPR:$inactive))>; } @@ -2079,32 +2305,43 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; -def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>; -def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>; +multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI, + SDNode unpred_op> { + def "": MVE_VCLSCLZ<"v"#opname, VTI.Suffix, VTI.Size, opcode>; -def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>; -def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; -def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_"#opname#"_predicated"); -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), - (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; - def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), - (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; - def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), - (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } } +defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, 
int_arm_mve_vcls>; +defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>; +defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>; + +defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>; +defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>; +defm MVE_VCLZs32 : MVE_VCLSCLZ_p<"clz", 1, MVE_v4i32, ctlz>; + class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, - list<dag> pattern=[]> + bit saturate, list<dag> pattern=[]> : MVEIntSingleSrc<iname, suffix, size, pattern> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; - let Inst{17-16} = 0b01; - let Inst{12-8} = 0b00011; + let Inst{17} = 0b0; + let Inst{16} = !eq(saturate, 0); + let Inst{12-11} = 0b00; + let Inst{10} = saturate; + let Inst{9-8} = 0b11; let Inst{7} = negate; let Inst{6} = 0b1; let Inst{4} = 0b0; @@ -2112,61 +2349,40 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, let validForTailPredication = 1; } -def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; -def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>; -def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (abs (v16i8 MQPR:$v))), - (v16i8 (MVE_VABSs8 $v))>; - def : Pat<(v8i16 (abs (v8i16 MQPR:$v))), - (v8i16 (MVE_VABSs16 $v))>; - def : Pat<(v4i32 (abs (v4i32 MQPR:$v))), - (v4i32 (MVE_VABSs32 $v))>; -} +multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate, + SDNode unpred_op, Intrinsic pred_int, + MVEVectorVTInfo VTI> { + def "" : MVE_VABSNEG_int<iname, VTI.Suffix, VTI.Size, negate, saturate>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>; -def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>; -def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>; + let Predicates = [HasMVEInt] in { + // VQABS and VQNEG have more difficult isel patterns defined elsewhere + if 
!eq(saturate, 0) then { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>; + } -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))), - (v16i8 (MVE_VNEGs8 $v))>; - def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))), - (v8i16 (MVE_VNEGs16 $v))>; - def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))), - (v4i32 (MVE_VNEGs32 $v))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>; + } } -class MVE_VQABSNEG<string iname, string suffix, bits<2> size, - bit negate, list<dag> pattern=[]> - : MVEIntSingleSrc<iname, suffix, size, pattern> { - - let Inst{28} = 0b1; - let Inst{25-23} = 0b111; - let Inst{21-20} = 0b11; - let Inst{17-16} = 0b00; - let Inst{12-8} = 0b00111; - let Inst{7} = negate; - let Inst{6} = 0b1; - let Inst{4} = 0b0; - let Inst{0} = 0b0; - let validForTailPredication = 1; +foreach VTI = [ MVE_v16s8, MVE_v8s16, MVE_v4s32 ] in { + defm "MVE_VABS" # VTI.Suffix : MVE_VABSNEG_int_m< + "vabs", 0, 0, abs, int_arm_mve_abs_predicated, VTI>; + defm "MVE_VQABS" # VTI.Suffix : MVE_VABSNEG_int_m< + "vqabs", 0, 1, ?, int_arm_mve_qabs_predicated, VTI>; + defm "MVE_VNEG" # VTI.Suffix : MVE_VABSNEG_int_m< + "vneg", 1, 0, vnegq, int_arm_mve_neg_predicated, VTI>; + defm "MVE_VQNEG" # VTI.Suffix : MVE_VABSNEG_int_m< + "vqneg", 1, 1, ?, int_arm_mve_qneg_predicated, VTI>; } -def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; -def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>; -def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>; - -def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>; -def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>; -def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>; - // int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times // zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag 
int_max, - dag zero_vec, MVE_VQABSNEG vqabs_instruction, - MVE_VQABSNEG vqneg_instruction> { + dag zero_vec, MVE_VABSNEG_int vqabs_instruction, + MVE_VABSNEG_int vqneg_instruction> { let Predicates = [HasMVEInt] in { // The below tree can be replaced by a vqabs instruction, as it represents // the following vectorized expression (r being the value in $reg): @@ -2257,6 +2473,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>; def : Pat<(v4i32 (ARMvmovImm timm:$simm)), (v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>; + def : Pat<(v2i64 (ARMvmovImm timm:$simm)), + (v2i64 (MVE_VMOVimmi64 nImmSplatI64:$simm))>; def : Pat<(v8i16 (ARMvmvnImm timm:$simm)), (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>; @@ -2265,6 +2483,15 @@ let Predicates = [HasMVEInt] in { def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)), (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>; + + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; } class MVE_VMINMAXA<string iname, string suffix, bits<2> size, @@ -2291,13 +2518,37 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>; -def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>; -def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>; +multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, bit bit_12> { + def "" : MVE_VMINMAXA<iname, VTI.Suffix, VTI.Size, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated v(min|max)a + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (abs (VTI.Vec MQPR:$Qm)))), + (VTI.Vec (Inst 
(VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>; + + // Predicated v(min|max)a + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } +} + +multiclass MVE_VMINA<MVEVectorVTInfo VTI> + : MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>; + +defm MVE_VMINAs8 : MVE_VMINA<MVE_v16s8>; +defm MVE_VMINAs16 : MVE_VMINA<MVE_v8s16>; +defm MVE_VMINAs32 : MVE_VMINA<MVE_v4s32>; -def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>; -def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>; -def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>; +multiclass MVE_VMAXA<MVEVectorVTInfo VTI> + : MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>; + +defm MVE_VMAXAs8 : MVE_VMAXA<MVE_v16s8>; +defm MVE_VMAXAs16 : MVE_VMAXA<MVE_v8s16>; +defm MVE_VMAXAs32 : MVE_VMAXA<MVE_v4s32>; // end of MVE Integer instructions @@ -2334,7 +2585,7 @@ class MVE_shift_imm<dag oops, dag iops, string iname, string suffix, let Inst{3-1} = Qm{2-0}; } -class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, +class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, bit top, list<dag> pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, "$Qd, $Qm", vpred_r, "", @@ -2344,25 +2595,36 @@ class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, let Inst{21} = 0b1; let Inst{20-19} = sz{1-0}; let Inst{18-16} = 0b000; + let Inst{12} = top; let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = 0b0; + let doubleWidthResult = 1; } -multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U, - list<dag> pattern=[]> { - def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> { - let Inst{12} = 0b0; - } - def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> { - let Inst{12} = 0b1; - } +multiclass MVE_VMOVL_m<bit top, string chr, MVEVectorVTInfo 
OutVTI, + MVEVectorVTInfo InVTI> { + def "": MVE_VMOVL<"vmovl" # chr, InVTI.Suffix, OutVTI.Size, + InVTI.Unsigned, top>; + defvar Inst = !cast<Instruction>(NAME); + + def : Pat<(OutVTI.Vec (int_arm_mve_vmovl_predicated (InVTI.Vec MQPR:$src), + (i32 InVTI.Unsigned), (i32 top), + (OutVTI.Pred VCCR:$pred), + (OutVTI.Vec MQPR:$inactive))), + (OutVTI.Vec (Inst (InVTI.Vec MQPR:$src), ARMVCCThen, + (OutVTI.Pred VCCR:$pred), + (OutVTI.Vec MQPR:$inactive)))>; } -defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>; -defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>; -defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>; -defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>; +defm MVE_VMOVLs8bh : MVE_VMOVL_m<0, "b", MVE_v8s16, MVE_v16s8>; +defm MVE_VMOVLs8th : MVE_VMOVL_m<1, "t", MVE_v8s16, MVE_v16s8>; +defm MVE_VMOVLu8bh : MVE_VMOVL_m<0, "b", MVE_v8u16, MVE_v16u8>; +defm MVE_VMOVLu8th : MVE_VMOVL_m<1, "t", MVE_v8u16, MVE_v16u8>; +defm MVE_VMOVLs16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8s16>; +defm MVE_VMOVLs16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8s16>; +defm MVE_VMOVLu16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8u16>; +defm MVE_VMOVLu16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8u16>; let Predicates = [HasMVEInt] in { def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16), @@ -2372,12 +2634,23 @@ let Predicates = [HasMVEInt] in { def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8), (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>; + def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8), + (MVE_VMOVLs8th MQPR:$src)>; + def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16), + (MVE_VMOVLs16th MQPR:$src)>; + + // zext_inreg 8 -> 16 + def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)), + (MVE_VMOVLu8bh MQPR:$src)>; // zext_inreg 16 -> 32 def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))), (MVE_VMOVLu16bh MQPR:$src)>; - // zext_inreg 8 -> 16 - def : 
Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))), - (MVE_VMOVLu8bh MQPR:$src)>; + // Same zext_inreg with vrevs, picking the top half + def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)), + (MVE_VMOVLu8th MQPR:$src)>; + def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), + (v4i32 (ARMvmovImm (i32 0xCFF)))), + (MVE_VMOVLu16th MQPR:$src)>; } @@ -2395,6 +2668,8 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th, // For the MVE_VSHLL_patterns multiclass to refer to Operand immediateType = immtype; + + let doubleWidthResult = 1; } // The immediate VSHLL instructions accept shift counts from 1 up to @@ -2438,6 +2713,7 @@ class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size, let Inst{11-6} = 0b111000; let Inst{4} = 0b0; let Inst{0} = 0b1; + let doubleWidthResult = 1; } multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U, @@ -2472,17 +2748,17 @@ multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> { def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm, (i32 VTI.Unsigned), (i32 top), - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive))), (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm, - ARMVCCThen, (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits), (i32 VTI.Unsigned), (i32 top), - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive))), (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen, - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; } @@ -2509,6 +2785,8 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28, let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b1; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 
0b1, shr_imm8> { @@ -2550,6 +2828,8 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN< @@ -2598,6 +2878,8 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12, let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = bit_0; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> { @@ -3131,41 +3413,34 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size, } -multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> { - def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>; - def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>; - def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>; - def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>; - def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>; - def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>; -} +multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode, + SDNode unpred_op> { + def "": MVE_VRINT<suffix, opcode, VTI.Suffix, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_vrint"#suffix#"_predicated"); -defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>; -defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>; + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } +} -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (frint 
(v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>; +multiclass MVE_VRINT_ops<MVEVectorVTInfo VTI> { + defm N : MVE_VRINT_m<VTI, "n", 0b000, int_arm_mve_vrintn>; + defm X : MVE_VRINT_m<VTI, "x", 0b001, frint>; + defm A : MVE_VRINT_m<VTI, "a", 0b010, fround>; + defm Z : MVE_VRINT_m<VTI, "z", 0b011, ftrunc>; + defm M : MVE_VRINT_m<VTI, "m", 0b101, ffloor>; + defm P : MVE_VRINT_m<VTI, "p", 0b111, fceil>; } +defm MVE_VRINTf16 : MVE_VRINT_ops<MVE_v8f16>; +defm MVE_VRINTf32 : MVE_VRINT_ops<MVE_v4f32>; + class MVEFloatArithNeon<string iname, string suffix, bit size, dag oops, dag iops, string ops, vpred_ops vpred, string cstr, list<dag> pattern=[]> @@ -3281,29 +3556,40 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4, let Inst{8} = bit_8; let Inst{7} = Qn{3}; let Inst{4} = bit_4; + let validForTailPredication = 1; } -def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; - -def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; 
-def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; +multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> { + def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms, + (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = int_arm_mve_fma_predicated; + defvar m1 = (VTI.Vec MQPR:$m1); + defvar m2 = (VTI.Vec MQPR:$m2); + defvar add = (VTI.Vec MQPR:$add); + defvar pred = (VTI.Pred VCCR:$pred); -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), - (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; - def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), - (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; - def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), - (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>; - def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), - (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>; + let Predicates = [HasMVEFloat] in { + if fms then { + def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + } else { + def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + } + } } +defm MVE_VFMAf32 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v4f32>; +defm MVE_VFMAf16 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v8f16>; +defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; +defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; + multiclass 
MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> { @@ -3423,10 +3709,10 @@ defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>; defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>; class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, - Operand imm_operand_type, list<dag> pattern=[]> + Operand imm_operand_type> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6), - "$Qd, $Qm, $imm6", vpred_r, "", pattern> { + "$Qd, $Qm, $imm6", vpred_r, "", []> { bits<4> Qd; bits<6> imm6; @@ -3468,14 +3754,43 @@ class MVE_VCVT_fix_f16<string suffix, bit U, bit op> let Inst{20} = 0b1; } -def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>; -def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>; -def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>; -def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>; -def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>; -def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>; -def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>; -def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>; +multiclass MVE_VCVT_fix_patterns<Instruction Inst, bit U, MVEVectorVTInfo DestVTI, + MVEVectorVTInfo SrcVTI> { + let Predicates = [HasMVEFloat] in { + def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix + (i32 U), (SrcVTI.Vec MQPR:$Qm), imm:$scale)), + (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale))>; + def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix_predicated (i32 U), + (DestVTI.Vec MQPR:$inactive), + (SrcVTI.Vec MQPR:$Qm), + imm:$scale, + (DestVTI.Pred VCCR:$mask))), + (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale, + ARMVCCThen, (DestVTI.Pred VCCR:$mask), + (DestVTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VCVT_fix_f32_m<bit U, bit op, + MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> { + def "" : 
MVE_VCVT_fix_f32<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>; + defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>; +} + +multiclass MVE_VCVT_fix_f16_m<bit U, bit op, + MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> { + def "" : MVE_VCVT_fix_f16<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>; + defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>; +} + +defm MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16_m<0b0, 0b0, MVE_v8f16, MVE_v8s16>; +defm MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16_m<0b0, 0b1, MVE_v8s16, MVE_v8f16>; +defm MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16_m<0b1, 0b0, MVE_v8f16, MVE_v8u16>; +defm MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16_m<0b1, 0b1, MVE_v8u16, MVE_v8f16>; +defm MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32_m<0b0, 0b0, MVE_v4f32, MVE_v4s32>; +defm MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32_m<0b0, 0b1, MVE_v4s32, MVE_v4f32>; +defm MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32_m<0b1, 0b0, MVE_v4f32, MVE_v4u32>; +defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>; class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, bits<2> rm, list<dag> pattern=[]> @@ -3497,23 +3812,44 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, let validForTailPredication = 1; } -multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op, - list<dag> pattern=[]> { - def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>; - def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>; - def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>; - def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>; +multiclass MVE_VCVT_fp_int_anpm_inner<MVEVectorVTInfo Int, MVEVectorVTInfo Flt, + string anpm, bits<2> rm> { + def "": MVE_VCVT_fp_int_anpm<Int.Suffix # "." 
# Flt.Suffix, Int.Size, + Int.Unsigned, anpm, rm>; + + defvar Inst = !cast<Instruction>(NAME); + defvar IntrBaseName = "int_arm_mve_vcvt" # anpm; + defvar UnpredIntr = !cast<Intrinsic>(IntrBaseName); + defvar PredIntr = !cast<Intrinsic>(IntrBaseName # "_predicated"); + + let Predicates = [HasMVEFloat] in { + def : Pat<(Int.Vec (UnpredIntr (i32 Int.Unsigned), (Flt.Vec MQPR:$in))), + (Int.Vec (Inst (Flt.Vec MQPR:$in)))>; + + def : Pat<(Int.Vec (PredIntr (i32 Int.Unsigned), (Int.Vec MQPR:$inactive), + (Flt.Vec MQPR:$in), (Flt.Pred VCCR:$pred))), + (Int.Vec (Inst (Flt.Vec MQPR:$in), ARMVCCThen, + (Flt.Pred VCCR:$pred), (Int.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VCVT_fp_int_anpm_outer<MVEVectorVTInfo Int, + MVEVectorVTInfo Flt> { + defm a : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "a", 0b00>; + defm n : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "n", 0b01>; + defm p : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "p", 0b10>; + defm m : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "m", 0b11>; } // This defines instructions such as MVE_VCVTu16f16a, with an explicit // rounding-mode suffix on the mnemonic. The class below will define // the bare MVE_VCVTu16f16 (with implied rounding toward zero). 
-defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>; -defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>; -defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>; -defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>; +defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8s16, MVE_v8f16>; +defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8u16, MVE_v8f16>; +defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4s32, MVE_v4f32>; +defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4u32, MVE_v4f32>; -class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, +class MVE_VCVT_fp_int<string suffix, bits<2> size, bit toint, bit unsigned, list<dag> pattern=[]> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> { @@ -3527,41 +3863,43 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, let Inst{17-16} = 0b11; let Inst{15-13} = Qd{2-0}; let Inst{12-9} = 0b0011; - let Inst{8-7} = op; + let Inst{8} = toint; + let Inst{7} = unsigned; let Inst{4} = 0b0; let validForTailPredication = 1; } +multiclass MVE_VCVT_fp_int_m<MVEVectorVTInfo Dest, MVEVectorVTInfo Src, + SDNode unpred_op> { + defvar Unsigned = !or(!eq(Dest.SuffixLetter,"u"), !eq(Src.SuffixLetter,"u")); + defvar ToInt = !eq(Src.SuffixLetter,"f"); + + def "" : MVE_VCVT_fp_int<Dest.Suffix # "." 
# Src.Suffix, Dest.Size, + ToInt, Unsigned>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(Dest.Vec (unpred_op (Src.Vec MQPR:$src))), + (Dest.Vec (Inst (Src.Vec MQPR:$src)))>; + def : Pat<(Dest.Vec (int_arm_mve_vcvt_fp_int_predicated + (Src.Vec MQPR:$src), (i32 Unsigned), + (Src.Pred VCCR:$mask), (Dest.Vec MQPR:$inactive))), + (Dest.Vec (Inst (Src.Vec MQPR:$src), ARMVCCThen, + (Src.Pred VCCR:$mask), + (Dest.Vec MQPR:$inactive)))>; + } +} // The unsuffixed VCVT for float->int implicitly rounds toward zero, // which I reflect here in the llvm instruction names -def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>; -def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>; -def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>; -def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>; +defm MVE_VCVTs16f16z : MVE_VCVT_fp_int_m<MVE_v8s16, MVE_v8f16, fp_to_sint>; +defm MVE_VCVTu16f16z : MVE_VCVT_fp_int_m<MVE_v8u16, MVE_v8f16, fp_to_uint>; +defm MVE_VCVTs32f32z : MVE_VCVT_fp_int_m<MVE_v4s32, MVE_v4f32, fp_to_sint>; +defm MVE_VCVTu32f32z : MVE_VCVT_fp_int_m<MVE_v4u32, MVE_v4f32, fp_to_uint>; // Whereas VCVT for int->float rounds to nearest -def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>; -def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>; -def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>; -def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>; - -let Predicates = [HasMVEFloat] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))), - (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>; - def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))), - (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>; - def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))), - (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>; - def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))), - (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>; - def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))), - (v4f32 (MVE_VCVTf32s32n (v4i32 
MQPR:$src)))>; - def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))), - (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>; - def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))), - (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>; - def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))), - (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>; -} +defm MVE_VCVTf16s16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8s16, sint_to_fp>; +defm MVE_VCVTf16u16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8u16, uint_to_fp>; +defm MVE_VCVTf32s32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4s32, sint_to_fp>; +defm MVE_VCVTf32u32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4u32, uint_to_fp>; class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, list<dag> pattern=[]> @@ -3582,26 +3920,29 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, let validForTailPredication = 1; } -def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; -def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>; - -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fabs MQPR:$src)), - (MVE_VABSf16 MQPR:$src)>; - def : Pat<(v4f32 (fabs MQPR:$src)), - (MVE_VABSf32 MQPR:$src)>; -} +multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int, + MVEVectorVTInfo VTI, bit opcode> { + def "" : MVE_VABSNEG_fp<iname, VTI.Suffix, VTI.Size, opcode>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>; -def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>; + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>; -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fneg MQPR:$src)), - (MVE_VNEGf16 MQPR:$src)>; - def : Pat<(v4f32 (fneg MQPR:$src)), - (MVE_VNEGf32 MQPR:$src)>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>; + } } +defm MVE_VABSf16 : MVE_VABSNEG_fp_m<"vabs", fabs, 
int_arm_mve_abs_predicated, + MVE_v8f16, 0>; +defm MVE_VABSf32 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated, + MVE_v4f32, 0>; +defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, + MVE_v8f16, 1>; +defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, + MVE_v4f32, 1>; + class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12, list<dag> pattern=[]> : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), @@ -3623,11 +3964,37 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12, let Inst{0} = 0b1; } -def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>; -def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>; +multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, + bit bit_12> { + def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated v(max|min)nma + def : Pat<(VTI.Vec (unpred_op (fabs (VTI.Vec MQPR:$Qd)), + (fabs (VTI.Vec MQPR:$Qm)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>; + + // Predicated v(max|min)nma + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } +} + +multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12> + : MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>; + +defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>; +defm MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>; -def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>; -def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>; +multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12> + : MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>; + +defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>; 
+defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>; // end of MVE Floating Point instructions @@ -3796,12 +4163,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> { def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; - def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))), (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; @@ -3810,12 +4177,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> { def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup 
GPR:$v2)), fc)))), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_z<PatLeaf fc> { @@ -3825,31 +4192,31 @@ multiclass unpred_vcmpf_z<PatLeaf fc> { (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))), - (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_r<int fc> { - def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), - (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; - def f32 : Pat<(v4i1 (ARMvcmp 
(v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), - (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; - def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; - def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, 
VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { @@ -3889,7 +4256,7 @@ let Predicates = [HasMVEFloat] in { } -// Extra "worst case" and/or/xor partterns, going into and out of GRP +// Extra "worst case" and/or/xor patterns, going into and out of GRP multiclass two_predops<SDPatternOperator opnode, Instruction insn> { def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))), (v16i1 (COPY_TO_REGCLASS @@ -3918,7 +4285,6 @@ let Predicates = [HasMVEInt] in { // example when moving between rGPR and VPR.P0 as part of predicate vector // shuffles. We also sometimes need to cast between different predicate // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. - def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; let Predicates = [HasMVEInt] in { @@ -3932,6 +4298,16 @@ let Predicates = [HasMVEInt] in { def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } + + // Here we match the specific SDNode type 'ARMVectorRegCastImpl' + // rather than the more general 'ARMVectorRegCast' which would also + // match some bitconverts. If we use the latter in cases where the + // input and output types are the same, the bitconvert gets elided + // and we end up generating a nonsense match of nothing. 
+ + foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in + foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in + def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), (VT MQPR:$src)>; } // end of MVE compares @@ -3973,11 +4349,32 @@ class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract, let Inst{0} = round; } +multiclass MVE_VQxDMLxDH_p<string iname, bit exch, bit round, bit subtract, + MVEVectorVTInfo VTI> { + def "": MVE_VQxDMLxDH<iname, exch, round, subtract, VTI.Suffix, VTI.Size, + !if(!eq(VTI.LaneBits, 32), ",@earlyclobber $Qd", "")>; + defvar Inst = !cast<Instruction>(NAME); + defvar ConstParams = (? (i32 exch), (i32 round), (i32 subtract)); + defvar unpred_intr = int_arm_mve_vqdmlad; + defvar pred_intr = int_arm_mve_vqdmlad_predicated; + + def : Pat<(VTI.Vec !con((unpred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams)), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)))>; + def : Pat<(VTI.Vec !con((pred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams, + (? 
(VTI.Pred VCCR:$pred)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; +} + multiclass MVE_VQxDMLxDH_multi<string iname, bit exch, bit round, bit subtract> { - def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>; - def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>; - def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">; + defm s8 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v16s8>; + defm s16 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v8s16>; + defm s32 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v4s32>; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; @@ -4051,6 +4448,7 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, let Inst{7} = Qn{3}; let Inst{0} = 0b0; let validForTailPredication = 1; + let doubleWidthResult = 1; } multiclass MVE_VMULL_m<MVEVectorVTInfo VTI, @@ -4072,10 +4470,10 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI, // Predicated multiply def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), - uflag, (? (i32 Top), (VTI.Pred VCCR:$mask), + uflag, (? 
(i32 Top), (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))), (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; } } @@ -4122,6 +4520,50 @@ defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, int_arm_mve_mull_poly_predicated, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))), + (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>; + def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))), + (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))), + (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16), + (sext_inreg (v4i32 MQPR:$src2), v4i16)), + (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16), + (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)), + (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8), + (sext_inreg (v8i16 MQPR:$src2), v8i8)), + (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8), + (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)), + (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))), + (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>; + def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))), + (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))), + (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))), + (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))), + (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 
MQPR:$src1)))), + (v4i32 (ARMvmovImm (i32 0xCFF)))), + (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), + (v4i32 (ARMvmovImm (i32 0xCFF))))), + (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)), + (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))), + (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)), + (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))), + (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>; +} + class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), @@ -4195,6 +4637,8 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17, let Inst{8} = 0b0; let Inst{7} = !if(!eq(bit_17, 0), 1, 0); let Inst{0} = 0b1; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } multiclass MVE_VxMOVxN_halves<string iname, string suffix, @@ -4213,21 +4657,121 @@ defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; + +multiclass MVE_VMOVN_p<Instruction Inst, bit top, + MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> { + // Match the most obvious MVEvmovn(a,b,t), which overwrites the odd or even + // lanes of a (depending on t) with the even lanes of b. + def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qd_src), + (VTI.Vec MQPR:$Qm), (i32 top))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>; + + if !eq(top, 0) then { + // If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd + // lanes of a with the odd lanes of b. In other words, the lanes we're + // _keeping_ from a are the even ones. 
So we can flip it round and say that + // this is the same as overwriting the even lanes of b with the even lanes + // of a, i.e. it's a VMOVNB with the operands reversed. + defvar vrev = !cast<SDNode>("ARMvrev" # InVTI.LaneBits); + def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qm), + (VTI.Vec (vrev MQPR:$Qd_src)), (i32 1))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>; + } + + // Match the IR intrinsic for a predicated VMOVN. This regards the Qm input + // as having wider lanes that we're narrowing, instead of already-narrow + // lanes that we're taking every other one of. + def : Pat<(VTI.Vec (int_arm_mve_vmovn_predicated (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), (i32 top), + (InVTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + ARMVCCThen, (InVTI.Pred VCCR:$pred)))>; +} + +defm : MVE_VMOVN_p<MVE_VMOVNi32bh, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VMOVN_p<MVE_VMOVNi32th, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VMOVN_p<MVE_VMOVNi16bh, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VMOVN_p<MVE_VMOVNi16th, 1, MVE_v16i8, MVE_v8i16>; + +multiclass MVE_VQMOVN_p<Instruction Inst, bit outU, bit inU, bit top, + MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> { + def : Pat<(VTI.Vec (int_arm_mve_vqmovn (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + (i32 outU), (i32 inU), (i32 top))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm)))>; + + def : Pat<(VTI.Vec (int_arm_mve_vqmovn_predicated (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + (i32 outU), (i32 inU), (i32 top), + (InVTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + ARMVCCThen, (InVTI.Pred VCCR:$pred)))>; +} + +defm : MVE_VQMOVN_p<MVE_VQMOVNs32bh, 0, 0, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs32th, 0, 0, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs16bh, 0, 0, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs16th, 0, 0, 1, MVE_v16i8, MVE_v8i16>; +defm : 
MVE_VQMOVN_p<MVE_VQMOVNu32bh, 1, 1, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu32th, 1, 1, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu16bh, 1, 1, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu16th, 1, 1, 1, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs32bh, 1, 0, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs32th, 1, 0, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs16bh, 1, 0, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>; + +def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisVT<3, i32>]>; +def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>; +def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>; + let Predicates = [HasMVEInt] in { - def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), - (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; - def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), - (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; - def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), - (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; - def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), - (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), 
(v8i16 MQPR:$Qm)))>; + + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))), + (v8i16 (MVE_VQSHRNbhs32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))), + (v16i8 (MVE_VQSHRNbhs16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))), + (v8i16 (MVE_VQSHRNths32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))), + (v16i8 (MVE_VQSHRNths16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))), + (v8i16 (MVE_VQSHRNbhu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))), + (v16i8 (MVE_VQSHRNbhu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))), + (v8i16 (MVE_VQSHRNthu32 (v8i16 
MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))), + (v16i8 (MVE_VQSHRNthu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; } class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, - list<dag> pattern=[]> - : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), - "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> { + dag iops_extra, vpred_ops vpred, string cstr> + : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), + !con(iops_extra, (ins MQPR:$Qm)), "$Qd, $Qm", + vpred, cstr, []> { let Inst{28} = op; let Inst{21-16} = 0b111111; let Inst{12} = T; @@ -4235,10 +4779,17 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, let Inst{0} = 0b1; let Predicates = [HasMVEFloat]; + let retainsPreviousHalfElement = 1; } +def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>; +def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>; +def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>; + multiclass MVE_VCVT_f2h_m<string iname, int half> { - def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>; + def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half, + (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { @@ -4250,11 +4801,28 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> { (v4i1 VCCR:$mask))), (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), ARMVCCThen, (v4i1 VCCR:$mask)))>; + + def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))), + (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>; } } multiclass MVE_VCVT_h2f_m<string iname, int half> { - def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>; + def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(v4f32 (int_arm_mve_vcvt_widen 
(v8f16 MQPR:$Qm), (i32 half))), + (v4f32 (Inst (v8f16 MQPR:$Qm)))>; + def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated + (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half), + (v4i1 VCCR:$mask))), + (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen, + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; + + def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))), + (v4f32 (Inst (v8f16 MQPR:$Qm)))>; + } } defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>; @@ -4353,15 +4921,37 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T, let Inst{7} = Qn{3}; let Inst{0} = 0b1; let validForTailPredication = 1; + let doubleWidthResult = 1; +} + +multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T, + string cstr> { + def "" : MVE_VQDMULL<iname, VTI.Suffix, size, T, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn), (i32 T))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 T), (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + } } -multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> { - def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; - def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; +multiclass MVE_VQDMULL_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> { + defm bh : MVE_VQDMULL_m<"vqdmullb", VTI, size, 0b0, cstr>; + defm th : MVE_VQDMULL_m<"vqdmullt", VTI, size, 0b1, cstr>; } -defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; -defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; +defm MVE_VQDMULLs16 : 
MVE_VQDMULL_halves<MVE_v8s16, 0b0>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc @@ -4407,10 +4997,61 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]> let Inst{3-0} = Rm{3-0}; } +// Patterns for vector-scalar instructions with integer operands +multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI, + SDNode unpred_op, SDNode pred_op, + bit unpred_has_sign = 0, + bit pred_has_sign = 0> { + defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?)); + defvar PredSign = !if(pred_has_sign, (? (i32 VTI.Unsigned)), (?)); + + let Predicates = [HasMVEInt] in { + // Unpredicated version + def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val))), + UnpredSign)), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated version + def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val))), + PredSign, + (pred_op (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +// Patterns for vector-scalar instructions with FP operands +multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int, + Instruction instr_f16, + Instruction instr_f32> { + let Predicates = [HasMVEFloat] in { + // Unpredicated F16 + def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>; + // Unpredicated F32 + def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated F16 + def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; + 
// Predicated F32 + def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + } +} + class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, - bit bit_5, bit bit_12, bit bit_16, - bit bit_28, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, "", pattern> { + bit bit_5, bit bit_12, bit bit_16, bit bit_28> + : MVE_qDest_rSrc<iname, suffix, ""> { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -4421,42 +5062,60 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix, - bit bit_5, bit bit_12, bit bit_16, - bit bit_28, list<dag> pattern=[]> { - def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00, - bit_5, bit_12, bit_16, bit_28>; - def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01, - bit_5, bit_12, bit_16, bit_28>; - def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10, - bit_5, bit_12, bit_16, bit_28>; -} - -defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>; -defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>; -defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>; - -defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; -defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; -defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 
(MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} +// Vector-scalar add/sub +multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + unpred_op, pred_int>; +} + +multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; + +multiclass MVE_VSUB_qr_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; + +defm MVE_VADD_qr_i8 : MVE_VADD_qr_m<MVE_v16i8>; +defm MVE_VADD_qr_i16 : MVE_VADD_qr_m<MVE_v8i16>; +defm MVE_VADD_qr_i32 : MVE_VADD_qr_m<MVE_v4i32>; + +defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m<MVE_v16i8>; +defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m<MVE_v8i16>; +defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>; + +// Vector-scalar saturating add/sub +multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode unpred_op_s, SDNode unpred_op_u, + Intrinsic pred_int> { + def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract, + 0b0, VTI.Unsigned>; + defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s); + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + unpred_op, pred_int, 0, 1>; +} + +multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI> + : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat, + int_arm_mve_qadd_predicated>; + +multiclass 
MVE_VQSUB_qr_m<MVEVectorVTInfo VTI> + : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat, + int_arm_mve_qsub_predicated>; + +defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>; +defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>; +defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>; +defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>; +defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>; +defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>; + +defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>; +defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>; +defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>; +defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>; +defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>; +defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>; class MVE_VQDMULL_qr<string iname, string suffix, bit size, bit T, string cstr="", list<dag> pattern=[]> @@ -4469,15 +5128,40 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size, let Inst{8} = 0b1; let Inst{5} = 0b1; let validForTailPredication = 1; + let doubleWidthResult = 1; } -multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> { - def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; - def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; +multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size, + bit T, string cstr> { + def "" : MVE_VQDMULL_qr<iname, VTI.Suffix, size, T, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val)), + (i32 T))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated + (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val)), + (i32 T), + (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (VTI.DblPred 
VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + } } -defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; -defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; +multiclass MVE_VQDMULL_qr_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> { + defm bh : MVE_VQDMULL_qr_m<"vqdmullb", VTI, size, 0b0, cstr>; + defm th : MVE_VQDMULL_qr_m<"vqdmullt", VTI, size, 0b1, cstr>; +} + +defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit subtract, @@ -4493,19 +5177,34 @@ class MVE_VxADDSUB_qr<string iname, string suffix, let validForTailPredication = 1; } -def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; -def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; -def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; -def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; -def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; -def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + Intrinsic unpred_int, Intrinsic pred_int> { + def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), + VTI, unpred_int, pred_int, 1, 1>; +} + +multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, + int_arm_mve_hadd_predicated>; + +multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, + int_arm_mve_hsub_predicated>; -def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; -def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>; -def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 
0b0, 0b10, 0b1>; -def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; -def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; -def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>; + +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; let Predicates = [HasMVEFloat] in { def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; @@ -4515,6 +5214,11 @@ let Predicates = [HasMVEFloat] in { def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; } +defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated, + MVE_VADD_qr_f16, MVE_VADD_qr_f32>; +defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated, + MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>; + class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, bit bit_7, bit bit_17, list<dag> pattern=[]> : MVE_qDest_single_rSrc<iname, suffix, pattern> { @@ -4563,19 +5267,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 
(MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; - def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; } class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> @@ -4594,6 +5298,20 @@ def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; +multiclass MVE_VBRSR_pat_m<MVEVectorVTInfo VTI, Instruction Inst> { + // Unpredicated + def : Pat<(VTI.Vec (int_arm_mve_vbrsr (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm)))>; + // Predicated + def : Pat<(VTI.Vec (int_arm_mve_vbrsr_predicated + (VTI.Vec MQPR:$inactive), + (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + let 
Predicates = [HasMVEInt] in { def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; @@ -4603,11 +5321,19 @@ let Predicates = [HasMVEInt] in { def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; + + defm : MVE_VBRSR_pat_m<MVE_v16i8, MVE_VBRSR8>; + defm : MVE_VBRSR_pat_m<MVE_v8i16, MVE_VBRSR16>; + defm : MVE_VBRSR_pat_m<MVE_v4i32, MVE_VBRSR32>; } -class MVE_VMUL_qr_int<string iname, string suffix, - bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, "", pattern> { +let Predicates = [HasMVEFloat] in { + defm : MVE_VBRSR_pat_m<MVE_v8f16, MVE_VBRSR16>; + defm : MVE_VBRSR_pat_m<MVE_v4f32, MVE_VBRSR32>; +} + +class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size> + : MVE_qDest_rSrc<iname, suffix, ""> { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -4618,19 +5344,16 @@ class MVE_VMUL_qr_int<string iname, string suffix, let validForTailPredication = 1; } -def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; -def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; -def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> { + def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + mul, int_arm_mve_mul_predicated>; } +defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>; +defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>; +defm MVE_VMUL_qr_i32 : 
MVE_VMUL_qr_int_m<MVE_v4i32>; + class MVE_VxxMUL_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]> : MVE_qDest_rSrc<iname, suffix, "", pattern> { @@ -4643,19 +5366,37 @@ class MVE_VxxMUL_qr<string iname, string suffix, let Inst{5} = 0b1; } -def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>; -def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; -def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; +multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28, + Intrinsic int_unpred, Intrinsic int_pred> { + def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + int_unpred, int_pred>; +} + +multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> : + MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, + int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>; + +multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> : + MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, + int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>; -def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; -def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; -def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; +defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>; +defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m<MVE_v8s16>; +defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m<MVE_v4s32>; + +defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>; +defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>; +defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } +defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated, + MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>; + class MVE_VFMAMLA_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, 
bit S, list<dag> pattern=[]> @@ -4668,42 +5409,87 @@ class MVE_VFMAMLA_qr<string iname, string suffix, let Inst{8} = 0b0; let Inst{5} = 0b0; let validForTailPredication = 1; + let hasSideEffects = 0; } -def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; -def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>; -def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>; -def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>; -def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>; -def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI, + bit scalar_addend> { + def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, + scalar_addend>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated"); + defvar v1 = (VTI.Vec MQPR:$v1); + defvar v2 = (VTI.Vec MQPR:$v2); + defvar vs = (VTI.Vec (ARMvdup rGPR:$s)); + defvar s = (i32 rGPR:$s); + defvar pred = (VTI.Pred VCCR:$pred); + + // The signed and unsigned variants of this instruction have different + // encodings, but they're functionally identical. For the sake of + // determinism, we generate only the unsigned variant. 
+ if VTI.Unsigned then let Predicates = [HasMVEInt] in { + if scalar_addend then { + def : Pat<(VTI.Vec (add (mul v1, v2), vs)), + (VTI.Vec (Inst v1, v2, s))>; + } else { + def : Pat<(VTI.Vec (add (mul v2, vs), v1)), + (VTI.Vec (Inst v1, v2, s))>; + } -def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>; -def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>; -def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>; -def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; -def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; -def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; + def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)), + (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>; + } +} -let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (add (v4i32 MQPR:$src1), - (v4i32 (mul (v4i32 MQPR:$src2), - (v4i32 (ARMvdup (i32 rGPR:$x))))))), - (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$src1), - (v8i16 (mul (v8i16 MQPR:$src2), - (v8i16 (ARMvdup (i32 rGPR:$x))))))), - (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; - def : Pat<(v16i8 (add (v16i8 MQPR:$src1), - (v16i8 (mul (v16i8 MQPR:$src2), - (v16i8 (ARMvdup (i32 rGPR:$x))))))), - (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; +defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>; +defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>; +defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>; +defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>; +defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>; +defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>; + +defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>; +defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>; +defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>; +defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>; +defm 
MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>; +defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>; + +multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, + bit scalar_addend> { + def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = int_arm_mve_fma_predicated; + defvar v1 = (VTI.Vec MQPR:$v1); + defvar v2 = (VTI.Vec MQPR:$v2); + defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s))); + defvar is = (i32 rGPR:$s); + defvar pred = (VTI.Pred VCCR:$pred); + + let Predicates = [HasMVEFloat] in { + if scalar_addend then { + def : Pat<(VTI.Vec (fma v1, v2, vs)), + (VTI.Vec (Inst v1, v2, is))>; + def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)), + (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>; + } else { + def : Pat<(VTI.Vec (fma v1, vs, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (fma vs, v1, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)), + (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; + def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)), + (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; + } + } } let Predicates = [HasMVEFloat] in { - def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; - def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; - def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>; - def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>; + defm MVE_VFMA_qr_f16 : MVE_VFMA_qr_multi<"vfma", MVE_v8f16, 0>; + defm MVE_VFMA_qr_f32 : MVE_VFMA_qr_multi<"vfma", MVE_v4f32, 0>; + defm MVE_VFMA_qr_Sf16 : MVE_VFMA_qr_multi<"vfmas", MVE_v8f16, 1>; + defm MVE_VFMA_qr_Sf32 : MVE_VFMA_qr_multi<"vfmas", MVE_v4f32, 1>; } class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size, @@ -4718,10 +5504,30 @@ class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size, let Inst{5} = bit_5; } +multiclass 
MVE_VQDMLAH_qr_multi<string iname, MVEVectorVTInfo VTI, + bit bit_5, bit bit_12> { + def "": MVE_VQDMLAH_qr<iname, VTI.Suffix, 0b0, VTI.Size, bit_5, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # iname); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_predicated"); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), (VTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + } +} + multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> { - def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>; - def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>; - def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>; + defm s8 : MVE_VQDMLAH_qr_multi<iname, MVE_v16s8, bit_5, bit_12>; + defm s16 : MVE_VQDMLAH_qr_multi<iname, MVE_v8s16, bit_5, bit_12>; + defm s32 : MVE_VQDMLAH_qr_multi<iname, MVE_v4s32, bit_5, bit_12>; } defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>; @@ -4752,6 +5558,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; let validForTailPredication = 1; + let hasSideEffects = 0; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; @@ -4787,6 +5594,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; let validForTailPredication = 1; + let hasSideEffects = 0; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; @@ -4855,6 +5663,8 @@ class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr> let Inst{12-5} = 0b01111000; let Inst{4} = idx2; let 
Inst{3-0} = Rt{3-0}; + + let hasSideEffects = 0; } // The assembly syntax for these instructions mentions the vector @@ -4924,6 +5734,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size, let mayLoad = load; let mayStore = !eq(load,0); + let hasSideEffects = 0; } // A parameter class used to encapsulate all the ways the writeback @@ -5004,22 +5815,44 @@ foreach wb = [MVE_vldst24_writeback< "vst" # n.nvecs # stage # "." # s.lanesize>; } +def SDTARMVST2 : SDTypeProfile<1, 5, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>, + SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>; +def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>, + SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>, + SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>; +def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>; +def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>; + multiclass MVE_vst24_patterns<int lanesize, ValueType VT> { foreach stage = [0,1] in def : Pat<(int_arm_mve_vst2q i32:$addr, - (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize) - (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), - t2_addr_offset_none:$addr)>; + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr)>; + foreach stage = [0,1] in + def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32), + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))), + (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb) + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr))>; foreach stage = [0,1,2,3] in def : Pat<(int_arm_mve_vst4q i32:$addr, - (VT MQPR:$v0), (VT MQPR:$v1), - (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize) - (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, 
- VT:$v2, qsub_2, VT:$v3, qsub_3), - t2_addr_offset_none:$addr)>; + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr)>; + foreach stage = [0,1,2,3] in + def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64), + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))), + (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb) + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr))>; } defm : MVE_vst24_patterns<8, v16i8>; defm : MVE_vst24_patterns<16, v8i16>; @@ -5097,6 +5930,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc, let mayLoad = dir.load; let mayStore = !eq(dir.load,0); + let hasSideEffects = 0; let validForTailPredication = 1; } |