Diffstat (limited to 'lib/Target/AArch64/AArch64InstrInfo.td')
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td  253
1 file changed, 219 insertions, 34 deletions
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eed53f36d574..1981bd5d3bf0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">,
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
AssemblerPredicate<"FeatureSEL2", "sel2">;
+def HasPMU : Predicate<"Subtarget->hasPMU()">,
+ AssemblerPredicate<"FeaturePMU", "pmu">;
+
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
@@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+ AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<"FeatureRCPC", "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">,
AssemblerPredicate<"FeatureBranchTargetId", "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
AssemblerPredicate<"FeatureMTE", "mte">;
+def HasTME : Predicate<"Subtarget->hasTME()">,
+ AssemblerPredicate<"FeatureTME", "tme">;
+def HasETE : Predicate<"Subtarget->hasETE()">,
+ AssemblerPredicate<"FeatureETE", "ete">;
+def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
+ AssemblerPredicate<"FeatureTRBE", "trbe">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, S
def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def SDT_AArch64unpk : SDTypeProfile<1, 1, [
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
+def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
+def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
+def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+
+ // Toggles patterns which aren't beneficial in GlobalISel when we aren't
+ // optimizing. This allows us to selectively use patterns without impacting
+ // SelectionDAG's behaviour.
+ // FIXME: One day there will probably be a nicer way to check for this, but
+ // today is not that day.
+ def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
}
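(For readability, the long predicate string above unpacks into the C++ check sketched below; optimizedGISelOrOtherSelector is a hypothetical helper used only for illustration, not an LLVM API.)

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Guarded patterns apply when we are optimizing at all, when GlobalISel
    // has already fallen back to SelectionDAG, or when the function has not
    // been GISel-legalized (i.e. we are not in the GlobalISel pipeline).
    static bool optimizedGISelOrOtherSelector(const MachineFunction &MF) {
      const MachineFunctionProperties &Props = MF.getProperties();
      return !MF.getFunction().hasOptNone() ||
             Props.hasProperty(MachineFunctionProperties::Property::FailedISel) ||
             !Props.hasProperty(MachineFunctionProperties::Property::Legalized);
    }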
include "AArch64InstrFormats.td"
@@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
def HWASAN_CHECK_MEMACCESS : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
- [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
+ [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
+ Sched<[]>;
+def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo<
+ (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
+ [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
Sched<[]>;
}
@@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+let Predicates = [HasTME] in {
+
+def TSTART : TMSystemI<0b0000, "tstart",
+ [(set GPR64:$Rt, (int_aarch64_tstart))]>;
+
+def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
+
+def TCANCEL : TMSystemException<0b011, "tcancel",
+ [(int_aarch64_tcancel i64_imm0_65535:$imm)]>;
+
+def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> {
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+} // HasTME
+
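(At the source level these instructions are reached through the ACLE TME intrinsics; a minimal sketch, assuming <arm_acle.h> provides __tstart/__tcommit/__tcancel/__ttest when compiling with +tme.)

    #include <arm_acle.h>
    #include <stdint.h>

    // __tstart() lowers to TSTART: it returns 0 when the transaction starts,
    // otherwise the failure cause after an abort. __tcommit() -> TCOMMIT,
    // __tcancel(imm16) -> TCANCEL, __ttest() -> TTEST (current nesting depth).
    static bool try_transactional_store(uint64_t *p, uint64_t v) {
      uint64_t status = __tstart();
      if (status == 0) {      // transaction in flight
        *p = v;               // speculative update
        __tcommit();          // make it visible atomically
        return true;
      }
      return false;           // fell back; status encodes why
    }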
//===----------------------------------------------------------------------===//
// Move immediate instructions.
//===----------------------------------------------------------------------===//
@@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>;
// Final group of aliases covers true "mov $Rd, $imm" cases.
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
@@ -917,8 +962,12 @@ def trunc_imm : SDNodeXForm<imm, [{
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
GISDNodeXFormEquiv<trunc_imm>;
+let Predicates = [OptimizedGISelOrOtherSelector] in {
+// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless
+// copies.
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+}
// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
@@ -1012,10 +1061,10 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
let AddedComplexity = 1 in {
-def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
- (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
-def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
- (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>;
}
// Because of the immediate format for add/sub-imm instructions, the
@@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) {
const DataLayout &DL = MF->getDataLayout();
- unsigned Align = G->getGlobal()->getPointerAlignment(DL);
- return Align >= 4 && G->getOffset() % 4 == 0;
+ MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL);
+ return Align && *Align >= 4 && G->getOffset() % 4 == 0;
}
if (auto *C = dyn_cast<ConstantPoolSDNode>(N))
return C->getAlignment() >= 4 && C->getOffset() % 4 == 0;
@@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
// the NEON variant.
+
+// Here we handle first -(a + b*c) for FNMADD:
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)),
+ (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
-// "(-a) + b*(-c)".
+// Now it's time for "(-a) + (-b)*c"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+// And here "(-a) + b*(-c)"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
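(In source terms, the added f16 patterns cover the same algebraic shapes already handled for f32/f64; a rough sketch, assuming _Float16 support and that fused multiply-adds are formed, e.g. via -ffp-contract=fast.)

    // Illustrative only: which expression shape selects which instruction.
    _Float16 fmsub_h  (_Float16 a, _Float16 b, _Float16 c) { return  c - a * b;  } // fma(-a, b,  c) -> FMSUBHrrr
    _Float16 fnmadd_h1(_Float16 a, _Float16 b, _Float16 c) { return -c - a * b;  } // fma(-a, b, -c) -> FNMADDHrrr
    _Float16 fnmadd_h2(_Float16 a, _Float16 b, _Float16 c) { return -c + a * -b; } // fma( a,-b, -c) -> FNMADDHrrr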
@@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
+// Extracting lane zero is a special case where we can just use a plain
+// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the
+// rest of the compiler, especially the register allocator and copy propagation,
+// to reason about, so is preferred when it's possible to use it.
+let AddedComplexity = 10 in {
+ def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>;
+ def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>;
+ def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>;
+}
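(A sketch of source that exercises these patterns, using the standard NEON intrinsics; the lane-0 reads below can now be selected as plain subregister copies, ultimately FMOVs, instead of lane-move instructions.)

    #include <arm_neon.h>

    int64_t first_i64(int64x2_t v) { return vgetq_lane_s64(v, 0); } // V128 -> dsub
    int32_t first_i32(int32x4_t v) { return vgetq_lane_s32(v, 0); } // V128 -> ssub
    int32_t low_i32  (int32x2_t v) { return vget_lane_s32 (v, 0); } // V64  -> ssub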
+
+// dot_v4i8
+class mul_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm, node:$offset),
+ (mul (ldop (add node:$Rn, node:$offset)),
+ (ldop (add node:$Rm, node:$offset)))>;
+class mulz_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm),
+ (mul (ldop node:$Rn), (ldop node:$Rm))>;
+
+def load_v4i8 :
+ OutPatFrag<(ops node:$R),
+ (INSERT_SUBREG
+ (v2i32 (IMPLICIT_DEF)),
+ (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)),
+ ssub)>;
+
+class dot_v4i8<Instruction DOT, SDPatternOperator ldop> :
+ Pat<(i32 (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)),
+ (mulz_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm))))),
+ (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR),
+ (load_v4i8 GPR64sp:$Rn),
+ (load_v4i8 GPR64sp:$Rm))),
+ sub_32)>, Requires<[HasDotProd]>;
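(The dot_v4i8 class recognizes a fully unrolled four-byte dot product computed from two byte pointers, roughly the shape below; signed element loads select SDOT, unsigned select UDOT. This is an illustrative sketch, not the only form that matches.)

    #include <stdint.h>

    // Four extending loads from each pointer, four multiplies, and an add
    // tree; the pattern rewrites this to a single SDOTv8i8/UDOTv8i8 on the
    // two 4-byte groups loaded into the low vector lanes.
    int32_t dot4_s8(const int8_t *a, const int8_t *b) {
      return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
    }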
+
+// dot_v8i8
+class ee_v8i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K),
+ (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>;
+
+class mul_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K),
+ (mulop (v4i16 (ee_v8i8<extend> node:$M, node:$K)),
+ (v4i16 (ee_v8i8<extend> node:$N, node:$K)))>;
+
+class idot_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 0)),
+ (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 4))))),
+ (i64 0)))>;
+
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>;
+
+class odot_v8i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (EXTRACT_SUBREG
+ (VADDV_32
+ (i64 (DOT (DUPv2i32gpr WZR),
+ (v8i8 node:$Vm),
+ (v8i8 node:$Vn)))),
+ sub_32)>;
+
+class dot_v8i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v8i8<mulop, extend> V64:$Vm, V64:$Vn),
+ (odot_v8i8<DOT> V64:$Vm, V64:$Vn)>,
+ Requires<[HasDotProd]>;
+
+// dot_v16i8
+class ee_v16i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K1, node:$K2),
+ (v4i16 (extract_subvector
+ (v8i16 (extend
+ (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>;
+
+class mul_v16i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2),
+ (v4i32
+ (mulop (v4i16 (ee_v16i8<extend> node:$M, node:$K1, node:$K2)),
+ (v4i16 (ee_v16i8<extend> node:$N, node:$K1, node:$K2))))>;
+
+class idot_v16i8<SDPatternOperator m, SDPatternOperator x> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 0)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 0))),
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 4)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 4)))))),
+ (i64 0)))>;
+
+class odot_v16i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (i32 (ADDVv4i32v
+ (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>;
+
+class dot_v16i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v16i8<mulop, extend> V128:$Vm, V128:$Vn),
+ (odot_v16i8<DOT> V128:$Vm, V128:$Vn)>,
+ Requires<[HasDotProd]>;
+
+let AddedComplexity = 10 in {
+ def : dot_v4i8<SDOTv8i8, sextloadi8>;
+ def : dot_v4i8<UDOTv8i8, zextloadi8>;
+ def : dot_v8i8<SDOTv8i8, AArch64smull, sext>;
+ def : dot_v8i8<UDOTv8i8, AArch64umull, zext>;
+ def : dot_v16i8<SDOTv16i8, AArch64smull, sext>;
+ def : dot_v16i8<UDOTv16i8, AArch64umull, zext>;
+
+ // FIXME: add patterns to generate vector by element dot product.
+ // FIXME: add SVE dot-product patterns.
+}
+
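(The v8i8 and v16i8 classes instantiated above target the vectorized form of the same reduction: widening multiplies of two byte vectors feeding a horizontal add (AArch64uaddv), rewritten to UDOT/SDOT plus a final ADDV/ADDP. Roughly, loops of the shape below once vectorized.)

    #include <stdint.h>

    // Illustrative only: after vectorization to 8- or 16-byte chunks this
    // becomes the extend + multiply + horizontal-add idiom matched above.
    uint32_t dot_u8(const uint8_t *a, const uint8_t *b, int n /* 8 or 16 */) {
      uint32_t sum = 0;
      for (int i = 0; i < n; ++i)
        sum += (uint32_t)a[i] * (uint32_t)b[i];
      return sum;
    }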
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"