aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AArch64
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2022-07-03 14:10:23 +0000
committerDimitry Andric <dim@FreeBSD.org>2022-07-03 14:10:23 +0000
commit145449b1e420787bb99721a429341fa6be3adfb6 (patch)
tree1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Target/AArch64
parentecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff)
Diffstat (limited to 'llvm/lib/Target/AArch64')
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td113
-rw-r--r--llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp49
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64CollectLOH.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp34
-rw-r--r--llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp11
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp1098
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h19
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp294
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp4023
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h83
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrAtomics.td37
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td257
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp608
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h56
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td876
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp201
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp236
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp49
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h30
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp82
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineScheduler.h33
-rw-r--r--llvm/lib/Target/AArch64/AArch64MacroFusion.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64PerfectShuffle.h13169
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp114
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.h5
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td51
-rw-r--r--llvm/lib/Target/AArch64/AArch64SLSHardening.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td73
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td583
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA55.td127
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA64FX.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedAmpere1.td1136
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td25
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedPredExynos.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedPredicates.td149
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedTSV110.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp34
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp203
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp37
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h365
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp36
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp383
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h24
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp134
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp580
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h8
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp6
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h3
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp38
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp590
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp12
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp6
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp8
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp9
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp26
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp7
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp38
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h11
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp31
-rw-r--r--llvm/lib/Target/AArch64/SMEInstrFormats.td538
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td378
-rw-r--r--llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp10
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h3
81 files changed, 16165 insertions, 11071 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 4d1464901777..a6065d4ed9ec 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -16,6 +16,8 @@
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Target/TargetMachine.h"
@@ -71,6 +73,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
void initializeAArch64BranchTargetsPass(PassRegistry&);
+void initializeAArch64CFIFixupPass(PassRegistry&);
void initializeAArch64CollectLOHPass(PassRegistry&);
void initializeAArch64CondBrTuningPass(PassRegistry &);
void initializeAArch64CompressJumpTablesPass(PassRegistry&);
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 9a04b28a8b8f..f092c039b58e 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -64,6 +64,10 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
"Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">;
+def FeatureLDAPR : SubtargetFeature<"ldapr", "HasLDAPR", "true",
+ "Use LDAPR to lower atomic loads; experimental until we "
+ "have more testing/a formal correctness proof">;
+
def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
"Enable out of line atomics to support LSE instructions">;
@@ -154,6 +158,10 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
+// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
+// as movi is more efficient across all cores. Newer cores can eliminate
+// fmovs early and there is no difference with movi, but this not true for
+// all implementations.
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
"Has no zero-cycle zeroing instructions for FP registers">;
@@ -168,7 +176,7 @@ def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround",
"The zero-cycle floating-point zeroing instruction has a bug">;
def FeatureStrictAlign : SubtargetFeature<"strict-align",
- "StrictAlign", "true",
+ "RequiresStrictAlign", "true",
"Disallow all unaligned memory "
"access">;
@@ -190,11 +198,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature<
"Prefer likely predicted branches over selects">;
def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
- "CustomAsCheapAsMove", "true",
+ "HasCustomCheapAsMoveHandling", "true",
"Use custom handling of cheap instructions">;
def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
- "ExynosAsCheapAsMove", "true",
+ "HasExynosCheapAsMoveHandling", "true",
"Use Exynos specific handling of cheap instructions",
[FeatureCustomCheapAsMoveHandling]>;
@@ -202,12 +210,16 @@ def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
- "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+ "IsMisaligned128StoreSlow", "true", "Misaligned 128 bit stores are slow">;
def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
- "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">;
+ "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
+
+def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
+ "IsStoreAddressAscend", "false",
+ "Schedule vector stores by ascending address">;
-def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
+def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
"true", "STR of Q register with register offset is slow">;
def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
@@ -246,6 +258,10 @@ def FeatureFuseCryptoEOR : SubtargetFeature<
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
"CPU fuses AES/PMULL and EOR operations">;
+def FeatureFuseAdrpAdd : SubtargetFeature<
+ "fuse-adrp-add", "HasFuseAdrpAdd", "true",
+ "CPU fuses adrp+add operations">;
+
def FeatureFuseLiterals : SubtargetFeature<
"fuse-literals", "HasFuseLiterals", "true",
"CPU fuses literal generation operations">;
@@ -438,13 +454,8 @@ def FeatureEnhancedCounterVirtualization :
def FeatureRME : SubtargetFeature<"rme", "HasRME",
"true", "Enable Realm Management Extension">;
-// A subset of SVE(2) instructions are legal in Streaming SVE execution mode
-// defined by SME.
-def FeatureStreamingSVE : SubtargetFeature<"streaming-sve",
- "HasStreamingSVE", "true",
- "Enable subset of SVE(2) instructions for Streaming SVE execution mode">;
def FeatureSME : SubtargetFeature<"sme", "HasSME", "true",
- "Enable Scalable Matrix Extension (SME)", [FeatureStreamingSVE, FeatureBF16]>;
+ "Enable Scalable Matrix Extension (SME)", [FeatureBF16, FeatureUseScalarIncVL]>;
def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true",
"Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>;
@@ -464,6 +475,11 @@ def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true",
def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769",
"FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">;
+def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice",
+ "NoBTIAtReturnTwice", "true",
+ "Don't place a BTI instruction "
+ "after a return-twice">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -534,7 +550,18 @@ def HasV8_0rOps : SubtargetFeature<
FeaturePAuth, FeatureRCPC,
//v8.4
FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI,
- FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>;
+ FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO,
+ // Not mandatory in v8.0-R, but included here on the grounds that it
+ // only enables names of system registers
+ FeatureSpecRestrict
+ ]>;
+
+// Only intended to be used by disassemblers.
+def FeatureAll
+ : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions", []>;
+
+class AssemblerPredicateWithAll<dag cond, string name="">
+ : AssemblerPredicate<(any_of FeatureAll, cond), name>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -552,6 +579,7 @@ include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
include "AArch64SchedPredicates.td"
include "AArch64SchedPredExynos.td"
+include "AArch64SchedPredAmpere.td"
include "AArch64Combine.td"
def AArch64InstrInfo : InstrInfo;
@@ -596,7 +624,7 @@ class AArch64Unsupported { list<Predicate> F; }
def SVEUnsupported : AArch64Unsupported {
let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3,
- HasSVE2BitPerm, HasSVEorStreamingSVE, HasSVE2orStreamingSVE];
+ HasSVE2BitPerm, HasSVEorSME, HasSVE2orSME];
}
def PAUnsupported : AArch64Unsupported {
@@ -621,6 +649,7 @@ include "AArch64SchedThunderX2T99.td"
include "AArch64SchedA64FX.td"
include "AArch64SchedThunderX3T110.td"
include "AArch64SchedTSV110.td"
+include "AArch64SchedAmpere1.td"
def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors">;
@@ -649,6 +678,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureFuseAES,
FeatureBalanceFPOps,
FeatureCustomCheapAsMoveHandling,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
@@ -657,11 +687,13 @@ def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
FeatureFuseAES,
FeatureFuseAddress,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals]>;
def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
FeatureFuseAES,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals]>;
def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
@@ -802,6 +834,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
FeatureFuseCryptoEOR,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureZCRegMove,
FeatureZCZeroing]>;
@@ -813,13 +846,15 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseCCSelect,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureLSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
-def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
- "Samsung Exynos-M3 processors",
+// Re-uses some scheduling and tunings from the ExynosM3 proc family.
+def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
+ "Samsung Exynos-M4 processors",
[FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureExynosCheapAsMoveHandling,
@@ -828,6 +863,7 @@ def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
+ FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureLSLFast,
FeaturePostRAScheduler,
@@ -934,6 +970,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
FeatureFuseAES,
FeaturePostRAScheduler]>;
+def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
+ "Ampere Computing Ampere-1 processors", [
+ FeaturePostRAScheduler,
+ FeatureFuseAES,
+ FeatureLSLFast,
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeatureCmpBccFusion,
+ FeatureFuseAddress,
+ FeatureFuseLiterals]>;
def ProcessorFeatures {
list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -947,13 +993,14 @@ def ProcessorFeatures {
FeatureFP16FML];
list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
- FeatureRCPC, FeatureSSBS, FeatureRAS];
+ FeatureRCPC, FeatureSSBS, FeatureRAS,
+ FeaturePerfMon];
list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
- FeatureRCPC, FeatureSSBS];
+ FeatureRCPC, FeatureSSBS, FeaturePerfMon];
list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
- FeatureRCPC];
+ FeatureRCPC, FeaturePerfMon, FeatureSSBS];
list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeaturePerfMon, FeatureSPE,
@@ -968,14 +1015,15 @@ def ProcessorFeatures {
FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8];
list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
FeatureFP16FML, FeatureSSBS, FeaturePredRes,
- FeatureSB, FeatureSpecRestrict];
+ FeatureSB];
list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureRCPC, FeaturePerfMon,
- FeatureSPE, FeatureFullFP16, FeatureDotProd];
+ FeatureSPE, FeatureFullFP16, FeatureDotProd,
+ FeatureSSBS];
list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureRCPC, FeaturePerfMon,
FeatureSPE, FeatureFullFP16, FeatureDotProd,
- FeaturePAuth];
+ FeaturePAuth, FeatureSSBS];
list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureMatMulInt8, FeatureBF16, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
@@ -1012,13 +1060,15 @@ def ProcessorFeatures {
FeatureRDM];
list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
- FeatureRCPC, FeatureSSBS];
+ FeatureRCPC, FeatureSSBS, FeaturePerfMon];
list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
- FeatureRCPC, FeatureSPE, FeatureSSBS];
+ FeatureRCPC, FeatureSPE, FeatureSSBS,
+ FeaturePerfMon];
list<SubtargetFeature> NeoverseN2 = [HasV8_5aOps, FeatureBF16, FeatureETE,
FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
- FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto];
+ FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto,
+ FeaturePerfMon];
list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
@@ -1041,17 +1091,20 @@ def ProcessorFeatures {
list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSPE,
FeatureFullFP16, FeatureFP16FML, FeatureDotProd];
+ list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
+ FeatureMTE, FeatureSSBS];
// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64. The extensions do not
// affect code generated by the compiler and can be used only by explicitly
// mentioning the new system register names in assembly.
- list<SubtargetFeature> Generic = [FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureETE];
+ list<SubtargetFeature> Generic = [FeatureFPARMv8, FeatureNEON, FeatureETE];
}
-
+// FeatureFuseAdrpAdd is enabled under Generic to allow linker merging
+// optimizations.
def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic,
- [FeatureFuseAES, FeaturePostRAScheduler]>;
+ [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>;
def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
[TuneA35]>;
def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,
@@ -1178,6 +1231,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX,
def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel,
[TuneCarmel]>;
+// Ampere Computing
+def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1,
+ [TuneAmpere1]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 4cdf5f144437..37a65b64a885 100644
--- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -223,6 +223,7 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
if (isFirstInstructionInSequence(PrevInstr) &&
isSecondInstructionInSequence(CurrInstr)) {
LLVM_DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+ (void) Idx;
Sequences.push_back(CurrInstr);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index b54a0eaba7d1..ef4860979dd3 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -132,7 +132,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
AArch64FI = MF.getInfo<AArch64FunctionInfo>();
- STI = static_cast<const AArch64Subtarget*>(&MF.getSubtarget());
+ STI = &MF.getSubtarget<AArch64Subtarget>();
SetupMachineFunction(MF);
@@ -143,10 +143,10 @@ public:
int Type =
COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
- OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
- OutStreamer->EmitCOFFSymbolStorageClass(Scl);
- OutStreamer->EmitCOFFSymbolType(Type);
- OutStreamer->EndCOFFSymbolDef();
+ OutStreamer->beginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer->emitCOFFSymbolStorageClass(Scl);
+ OutStreamer->emitCOFFSymbolType(Type);
+ OutStreamer->endCOFFSymbolDef();
}
// Emit the rest of the function body.
@@ -204,10 +204,10 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
// Emit an absolute @feat.00 symbol. This appears to be some kind of
// compiler features bitfield read by link.exe.
MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
- OutStreamer->BeginCOFFSymbolDef(S);
- OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
- OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
- OutStreamer->EndCOFFSymbolDef();
+ OutStreamer->beginCOFFSymbolDef(S);
+ OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer->endCOFFSymbolDef();
int64_t Feat00Flags = 0;
if (M.getModuleFlag("cfguard")) {
@@ -251,7 +251,7 @@ void AArch64AsmPrinter::emitFunctionHeaderComment() {
const AArch64FunctionInfo *FI = MF->getInfo<AArch64FunctionInfo>();
Optional<std::string> OutlinerString = FI->getOutliningStyle();
if (OutlinerString != None)
- OutStreamer->GetCommentOS() << ' ' << OutlinerString;
+ OutStreamer->getCommentOS() << ' ' << OutlinerString;
}
void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
@@ -378,10 +378,10 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) {
bool CompileKernel =
(AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1;
- OutStreamer->SwitchSection(OutContext.getELFSection(
+ OutStreamer->switchSection(OutContext.getELFSection(
".text.hot", ELF::SHT_PROGBITS,
- ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
- Sym->getName(), /*IsComdat=*/true));
+ ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(),
+ /*IsComdat=*/true));
OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
@@ -827,7 +827,7 @@ void AArch64AsmPrinter::emitJumpTableInfo() {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
- OutStreamer->SwitchSection(ReadOnlySec);
+ OutStreamer->switchSection(ReadOnlySec);
auto AFI = MF->getInfo<AArch64FunctionInfo>();
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
@@ -865,7 +865,7 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall ||
MF->getFunction().getCallingConv() ==
CallingConv::AArch64_SVE_VectorCall ||
- STI->getRegisterInfo()->hasSVEArgsOrReturn(MF)) {
+ MF->getInfo<AArch64FunctionInfo>()->isSVECC()) {
auto *TS =
static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
TS->emitDirectiveVariantPCS(CurrentFnSym);
@@ -1129,7 +1129,8 @@ void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) {
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
- if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
+ if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
+ STI->hasNEON()) {
// Convert H/S register to corresponding D register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
@@ -1262,7 +1263,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case AArch64::DBG_VALUE:
- case AArch64::DBG_VALUE_LIST: {
+ case AArch64::DBG_VALUE_LIST:
if (isVerbose() && OutStreamer->hasRawTextSupport()) {
SmallString<128> TmpStr;
raw_svector_ostream OS(TmpStr);
@@ -1282,8 +1283,18 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
OutStreamer->emitCFIBKeyFrame();
return;
- }
- }
+ }
+
+ case AArch64::EMITMTETAGGED: {
+ ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
+ if (ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
+ ExceptionHandlingType != ExceptionHandling::ARM)
+ return;
+
+ if (getFunctionCFISectionType(*MF) != CFISection::None)
+ OutStreamer->emitCFIMTETaggedFrame();
+ return;
+ }
// Tail calls use pseudo instructions so they have the proper code-gen
// attributes (isCall, isReturn, etc.). We lower them to the real
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index f26151536a58..c0da242a26de 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -82,9 +82,9 @@ def CC_AArch64_AAPCS : CallingConv<[
nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCPassIndirect<i64>>,
- CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
CCAssignToReg<[P0, P1, P2, P3]>>,
- CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
CCPassIndirect<i64>>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -149,7 +149,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
- CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
CCAssignToReg<[P0, P1, P2, P3]>>
]>;
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index ac243347b24d..d12689970dc5 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -528,10 +528,8 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) {
// count as MultiUser or block optimization. This is especially important on
// arm64_32, where any memory operation is likely to be an explicit use of
// xN and an implicit use of wN (the base address register).
- if (!UsesSeen.count(Idx)) {
+ if (UsesSeen.insert(Idx).second)
handleUse(MI, MO, LOHInfos[Idx]);
- UsesSeen.insert(Idx);
- }
}
}
@@ -559,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
// Walk the basic block backwards and update the per register state machine
// in the process.
for (const MachineInstr &MI :
- instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
+ instructionsWithoutDebug(MBB.instr_rbegin(), MBB.instr_rend())) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AArch64::ADDXri:
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1994e0eb7fb9..18c111255e53 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -217,7 +217,7 @@ def AArch64PostLegalizerLoweringHelper
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombinerHelper
: GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
- [copy_prop, erase_undef_store, combines_for_extload,
+ [copy_prop, combines_for_extload,
sext_trunc_sextload, mutate_anyext_to_zext,
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
@@ -228,6 +228,6 @@ def AArch64PostLegalizerCombinerHelper
select_combines, fold_merge_to_zext,
constant_fold, identity_combines,
ptr_add_immed_chain, overlapping_and,
- split_store_zero_128]> {
+ split_store_zero_128, undef_combines]> {
let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 82e8df3b73f9..343f888b7552 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -247,8 +247,8 @@ void SSACCmpConv::updateTailPHIs() {
for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
// PHI operands are (Reg, MBB) at (oi-2, oi-1).
if (I.getOperand(oi - 1).getMBB() == CmpBB) {
- I.RemoveOperand(oi - 1);
- I.RemoveOperand(oi - 2);
+ I.removeOperand(oi - 1);
+ I.removeOperand(oi - 2);
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index b0f739cc26e6..910f8cdede75 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -86,6 +86,7 @@ private:
unsigned N);
bool expandCALL_RVMARKER(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
+ bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
};
@@ -759,6 +760,37 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER(
return true;
}
+bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ // Expand CALL_BTI pseudo to:
+ // - a branch to the call target
+ // - a BTI instruction
+ // Mark the sequence as a bundle, to avoid passes moving other code in
+ // between.
+
+ MachineInstr &MI = *MBBI;
+ MachineOperand &CallTarget = MI.getOperand(0);
+ assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
+ "invalid operand for regular call");
+ unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
+ MachineInstr *Call =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
+ Call->addOperand(CallTarget);
+
+ MachineInstr *BTI =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT))
+ // BTI J so that setjmp can to BR to this.
+ .addImm(36)
+ .getInstr();
+
+ if (MI.shouldUpdateCallSiteInfo())
+ MBB.getParent()->moveCallSiteInfo(&MI, Call);
+
+ MI.eraseFromParent();
+ finalizeBundle(MBB, Call->getIterator(), std::next(BTI->getIterator()));
+ return true;
+}
+
bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
Register CtxReg = MBBI->getOperand(0).getReg();
@@ -1238,6 +1270,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
case AArch64::BLR_RVMARKER:
return expandCALL_RVMARKER(MBB, MBBI);
+ case AArch64::BLR_BTI:
+ return expandCALL_BTI(MBB, MBBI);
case AArch64::StoreSwiftAsyncContext:
return expandStoreSwiftAsyncContext(MBB, MBBI);
}
diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 793663ef97d7..6de374125466 100644
--- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -813,7 +813,7 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
}
bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
- auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ auto &ST = Fn.getSubtarget<AArch64Subtarget>();
if (ST.getProcFamily() != AArch64Subtarget::Falkor)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index c67fa62c7a92..49fffa01a974 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -14,6 +14,7 @@
#include "AArch64.h"
#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -282,8 +283,7 @@ public:
explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo)
: FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
- Subtarget =
- &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget());
+ Subtarget = &FuncInfo.MF->getSubtarget<AArch64Subtarget>();
Context = &FuncInfo.Fn->getContext();
}
@@ -3127,6 +3127,13 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!Callee && !Symbol)
return false;
+ // Allow SelectionDAG isel to handle calls to functions like setjmp that need
+ // a bti instruction following the call.
+ if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
+ !Subtarget->noBTIAtReturnTwice() &&
+ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
+ return false;
+
// Allow SelectionDAG isel to handle tail calls.
if (IsTailCall)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a4d20735e2b1..78babdf9f1f0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -117,6 +117,72 @@
//
// FIXME: also explain the redzone concept.
//
+// An example of the prologue:
+//
+// .globl __foo
+// .align 2
+// __foo:
+// Ltmp0:
+// .cfi_startproc
+// .cfi_personality 155, ___gxx_personality_v0
+// Leh_func_begin:
+// .cfi_lsda 16, Lexception33
+//
+// stp xa,bx, [sp, -#offset]!
+// ...
+// stp x28, x27, [sp, #offset-32]
+// stp fp, lr, [sp, #offset-16]
+// add fp, sp, #offset - 16
+// sub sp, sp, #1360
+//
+// The Stack:
+// +-------------------------------------------+
+// 10000 | ........ | ........ | ........ | ........ |
+// 10004 | ........ | ........ | ........ | ........ |
+// +-------------------------------------------+
+// 10008 | ........ | ........ | ........ | ........ |
+// 1000c | ........ | ........ | ........ | ........ |
+// +===========================================+
+// 10010 | X28 Register |
+// 10014 | X28 Register |
+// +-------------------------------------------+
+// 10018 | X27 Register |
+// 1001c | X27 Register |
+// +===========================================+
+// 10020 | Frame Pointer |
+// 10024 | Frame Pointer |
+// +-------------------------------------------+
+// 10028 | Link Register |
+// 1002c | Link Register |
+// +===========================================+
+// 10030 | ........ | ........ | ........ | ........ |
+// 10034 | ........ | ........ | ........ | ........ |
+// +-------------------------------------------+
+// 10038 | ........ | ........ | ........ | ........ |
+// 1003c | ........ | ........ | ........ | ........ |
+// +-------------------------------------------+
+//
+// [sp] = 10030 :: >>initial value<<
+// sp = 10020 :: stp fp, lr, [sp, #-16]!
+// fp = sp == 10020 :: mov fp, sp
+// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
+// sp == 10010 :: >>final value<<
+//
+// The frame pointer (w29) points to address 10020. If we use an offset of
+// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+// for w27, and -32 for w28:
+//
+// Ltmp1:
+// .cfi_def_cfa w29, 16
+// Ltmp2:
+// .cfi_offset w30, -8
+// Ltmp3:
+// .cfi_offset w29, -16
+// Ltmp4:
+// .cfi_offset w27, -24
+// Ltmp5:
+// .cfi_offset w28, -32
+//
//===----------------------------------------------------------------------===//
#include "AArch64FrameLowering.h"
@@ -126,6 +192,7 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -154,7 +221,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -187,7 +253,7 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
cl::init(true), cl::Hidden);
cl::opt<bool> EnableHomogeneousPrologEpilog(
- "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
+ "homogeneous-prolog-epilog", cl::Hidden,
cl::desc("Emit homogeneous prologue and epilogue for the size "
"optimization (default = off)"));
@@ -233,6 +299,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);
+static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF);
/// Returns true if a homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
@@ -440,137 +507,309 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(I);
}
-// Convenience function to create a DWARF expression for
-// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
-static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
- int NumBytes, int NumVGScaledBytes, unsigned VG,
- llvm::raw_string_ostream &Comment) {
- uint8_t buffer[16];
+void AArch64FrameLowering::emitCalleeSavedGPRLocations(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
- if (NumBytes) {
- Expr.push_back(dwarf::DW_OP_consts);
- Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
- Expr.push_back((uint8_t)dwarf::DW_OP_plus);
- Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
- }
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
- if (NumVGScaledBytes) {
- Expr.push_back((uint8_t)dwarf::DW_OP_consts);
- Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
- Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
- Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
- Expr.push_back(0);
+ for (const auto &Info : CSI) {
+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ continue;
- Expr.push_back((uint8_t)dwarf::DW_OP_mul);
- Expr.push_back((uint8_t)dwarf::DW_OP_plus);
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+ unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
- Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
- << std::abs(NumVGScaledBytes) << " * VG";
+ int64_t Offset =
+ MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
-// Creates an MCCFIInstruction:
-// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
-MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
- const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
- int64_t NumBytes, NumVGScaledBytes;
- AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
- NumVGScaledBytes);
+void AArch64FrameLowering::emitCalleeSavedSVELocations(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+
+ for (const auto &Info : CSI) {
+ if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
+ continue;
+
+ // Not all unwinders may know about SVE registers, so assume the lowest
+ // common denominator.
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+ unsigned Reg = Info.getReg();
+ if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
+ continue;
+
+ StackOffset Offset =
+ StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
+ StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
- std::string CommentBuffer = "sp";
- llvm::raw_string_ostream Comment(CommentBuffer);
+ unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+}
- // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
- SmallString<64> Expr;
- Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
- Expr.push_back(0);
- appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
- TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ emitCalleeSavedGPRLocations(MBB, MBBI);
+ emitCalleeSavedSVELocations(MBB, MBBI);
+}
- // Wrap this into DW_CFA_def_cfa.
- SmallString<64> DefCfaExpr;
- DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
- uint8_t buffer[16];
- DefCfaExpr.append(buffer,
- buffer + encodeULEB128(Expr.size(), buffer));
- DefCfaExpr.append(Expr.str());
- return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
- Comment.str());
+static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned DwarfReg) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
+ BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
}
-MCCFIInstruction AArch64FrameLowering::createCfaOffset(
- const TargetRegisterInfo &TRI, unsigned Reg,
- const StackOffset &OffsetFromDefCFA) const {
- int64_t NumBytes, NumVGScaledBytes;
- AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
- OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
+void AArch64FrameLowering::resetCFIToInitialState(
+ MachineBasicBlock &MBB) const {
- unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
+ MachineFunction &MF = *MBB.getParent();
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const auto &TRI =
+ static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
+ const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
- // Non-scalable offsets can use DW_CFA_offset directly.
- if (!NumVGScaledBytes)
- return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
+ const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
+ DebugLoc DL;
- std::string CommentBuffer;
- llvm::raw_string_ostream Comment(CommentBuffer);
- Comment << printReg(Reg, &TRI) << " @ cfa";
+ // Reset the CFA to `SP + 0`.
+ MachineBasicBlock::iterator InsertPt = MBB.begin();
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
+ BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
- // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
- SmallString<64> OffsetExpr;
- appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
- TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+ // Flip the RA sign state.
+ if (MFI.shouldSignReturnAddress()) {
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
+ }
- // Wrap this into DW_CFA_expression
- SmallString<64> CfaExpr;
- CfaExpr.push_back(dwarf::DW_CFA_expression);
- uint8_t buffer[16];
- CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
- CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
- CfaExpr.append(OffsetExpr.str());
+ // Shadow call stack uses X18, reset it.
+ if (needsShadowCallStackPrologueEpilogue(MF))
+ insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
+ TRI.getDwarfRegNum(AArch64::X18, true));
- return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
+ // Emit .cfi_same_value for callee-saved registers.
+ const std::vector<CalleeSavedInfo> &CSI =
+ MF.getFrameInfo().getCalleeSavedInfo();
+ for (const auto &Info : CSI) {
+ unsigned Reg = Info.getReg();
+ if (!TRI.regNeedsCFI(Reg, Reg))
+ continue;
+ insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
+ TRI.getDwarfRegNum(Reg, true));
+ }
}
-void AArch64FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool SVE) {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- const TargetInstrInfo *TII = STI.getInstrInfo();
- DebugLoc DL = MBB.findDebugLoc(MBBI);
- // Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
return;
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
for (const auto &Info : CSI) {
- Register Reg = Info.getReg();
+ if (SVE !=
+ (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
+ continue;
- // Not all unwinders may know about SVE registers, so assume the lowest
- // common demoninator.
- unsigned NewReg;
- if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
- Reg = NewReg;
- else
+ unsigned Reg = Info.getReg();
+ if (SVE &&
+ !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
- StackOffset Offset;
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- Offset =
- StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
- StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
- } else {
- Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
- getOffsetOfLocalArea());
- }
- unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+}
+
+void AArch64FrameLowering::emitCalleeSavedGPRRestores(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ emitCalleeSavedRestores(MBB, MBBI, false);
+}
+
+void AArch64FrameLowering::emitCalleeSavedSVERestores(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ emitCalleeSavedRestores(MBB, MBBI, true);
+}
+
+static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
+ switch (Reg.id()) {
+ default:
+ // The called routine is expected to preserve r19-r28
+ // r29 and r30 are used as frame pointer and link register resp.
+ return 0;
+
+ // GPRs
+#define CASE(n) \
+ case AArch64::W##n: \
+ case AArch64::X##n: \
+ return AArch64::X##n
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ CASE(17);
+ CASE(18);
+#undef CASE
+
+ // FPRs
+#define CASE(n) \
+ case AArch64::B##n: \
+ case AArch64::H##n: \
+ case AArch64::S##n: \
+ case AArch64::D##n: \
+ case AArch64::Q##n: \
+ return HasSVE ? AArch64::Z##n : AArch64::Q##n
+ CASE(0);
+ CASE(1);
+ CASE(2);
+ CASE(3);
+ CASE(4);
+ CASE(5);
+ CASE(6);
+ CASE(7);
+ CASE(8);
+ CASE(9);
+ CASE(10);
+ CASE(11);
+ CASE(12);
+ CASE(13);
+ CASE(14);
+ CASE(15);
+ CASE(16);
+ CASE(17);
+ CASE(18);
+ CASE(19);
+ CASE(20);
+ CASE(21);
+ CASE(22);
+ CASE(23);
+ CASE(24);
+ CASE(25);
+ CASE(26);
+ CASE(27);
+ CASE(28);
+ CASE(29);
+ CASE(30);
+ CASE(31);
+#undef CASE
+ }
+}
+
+void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
+ MachineBasicBlock &MBB) const {
+ // Insertion point.
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+ // Fake a debug loc.
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ const MachineFunction &MF = *MBB.getParent();
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
+
+ BitVector GPRsToZero(TRI.getNumRegs());
+ BitVector FPRsToZero(TRI.getNumRegs());
+ bool HasSVE = STI.hasSVE();
+ for (MCRegister Reg : RegsToZero.set_bits()) {
+ if (TRI.isGeneralPurposeRegister(MF, Reg)) {
+ // For GPRs, we only care to clear out the 64-bit register.
+ if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
+ GPRsToZero.set(XReg);
+ } else if (AArch64::FPR128RegClass.contains(Reg) ||
+ AArch64::FPR64RegClass.contains(Reg) ||
+ AArch64::FPR32RegClass.contains(Reg) ||
+ AArch64::FPR16RegClass.contains(Reg) ||
+ AArch64::FPR8RegClass.contains(Reg)) {
+ // For FPRs,
+ if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
+ FPRsToZero.set(XReg);
+ }
+ }
+
+ const AArch64InstrInfo &TII = *STI.getInstrInfo();
+
+ // Zero out GPRs.
+ for (MCRegister Reg : GPRsToZero.set_bits())
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0);
+
+ // Zero out FP/vector registers.
+ for (MCRegister Reg : FPRsToZero.set_bits())
+ if (HasSVE)
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::DUP_ZI_D), Reg)
+ .addImm(0)
+ .addImm(0);
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVIv2d_ns), Reg).addImm(0);
+
+ if (HasSVE) {
+ for (MCRegister PReg :
+ {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
+ AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
+ AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
+ AArch64::P15}) {
+ if (RegsToZero[PReg])
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
+ }
}
}
@@ -881,16 +1120,9 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
- bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
- // Ignore instructions that do not operate on SP, i.e. shadow call stack
- // instructions and associated CFI instruction.
- while (MBBI->getOpcode() == AArch64::STRXpost ||
- MBBI->getOpcode() == AArch64::LDRXpre ||
- MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
- if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
- assert(MBBI->getOperand(0).getReg() != AArch64::SP);
- ++MBBI;
- }
+ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
+ MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
+ int CFAOffset = 0) {
unsigned NewOpc;
switch (MBBI->getOpcode()) {
default:
@@ -949,12 +1181,14 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// If the first store isn't right where we want SP then we can't fold the
// update in so create a normal arithmetic instruction instead.
+ MachineFunction &MF = *MBB.getParent();
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(CSStackSizeInc), TII,
- InProlog ? MachineInstr::FrameSetup
- : MachineInstr::FrameDestroy);
+ StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
+ false, false, nullptr, EmitCFI,
+ StackOffset::getFixed(CFAOffset));
+
return std::prev(MBBI);
}
@@ -981,8 +1215,15 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// Generate a new SEH code that corresponds to the new instruction.
if (NeedsWinCFI) {
*HasWinCFI = true;
- InsertSEH(*MIB, *TII,
- InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
+ InsertSEH(*MIB, *TII, FrameFlag);
+ }
+
+ if (EmitCFI) {
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(FrameFlag);
}
return std::prev(MBB.erase(MBBI));
@@ -998,16 +1239,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
return;
unsigned Opc = MI.getOpcode();
-
- // Ignore instructions that do not operate on SP, i.e. shadow call stack
- // instructions and associated CFI instruction.
- if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
- Opc == AArch64::CFI_INSTRUCTION) {
- if (Opc != AArch64::CFI_INSTRUCTION)
- assert(MI.getOperand(0).getReg() != AArch64::SP);
- return;
- }
-
unsigned Scale;
switch (Opc) {
case AArch64::STPXi:
@@ -1049,38 +1280,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
}
}
-static void adaptForLdStOpt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator FirstSPPopI,
- MachineBasicBlock::iterator LastPopI) {
- // Sometimes (when we restore in the same order as we save), we can end up
- // with code like this:
- //
- // ldp x26, x25, [sp]
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // add sp, sp, #64
- //
- // In this case, it is always better to put the first ldp at the end, so
- // that the load-store optimizer can run and merge the ldp and the add into
- // a post-index ldp.
- // If we managed to grab the first pop instruction, move it to the end.
- if (ReverseCSRRestoreSeq)
- MBB.splice(FirstSPPopI, &MBB, LastPopI);
- // We should end up with something like this now:
- //
- // ldp x24, x23, [sp, #16]
- // ldp x22, x21, [sp, #32]
- // ldp x20, x19, [sp, #48]
- // ldp x26, x25, [sp]
- // add sp, sp, #64
- //
- // and the load-store optimizer can merge the last two instructions into:
- //
- // ldp x26, x25, [sp], #64
- //
-}
-
static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}
@@ -1099,6 +1298,80 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
}
}
+static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
+ if (!(llvm::any_of(
+ MF.getFrameInfo().getCalleeSavedInfo(),
+ [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
+ MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
+ return false;
+
+ if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
+ report_fatal_error("Must reserve x18 to use shadow call stack");
+
+ return true;
+}
+
+static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool NeedsWinCFI,
+ bool NeedsUnwindInfo) {
+ // Shadow call stack prolog: str x30, [x18], #8
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::X18)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // This instruction also makes x18 live-in to the entry block.
+ MBB.addLiveIn(AArch64::X18);
+
+ if (NeedsWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (NeedsUnwindInfo) {
+ // Emit a CFI instruction that causes 8 to be subtracted from the value of
+ // x18 when unwinding past this frame.
+ static const char CFIInst[] = {
+ dwarf::DW_CFA_val_expression,
+ 18, // register
+ 2, // length
+ static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+ static_cast<char>(-8) & 0x7f, // addend (sleb128)
+ };
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
+ nullptr, StringRef(CFIInst, sizeof(CFIInst))));
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+}
+
+static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) {
+ // Shadow call stack epilog: ldr x30, [x18, #-8]!
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR, RegState::Define)
+ .addReg(AArch64::X18)
+ .addImm(-8)
+ .setMIFlag(MachineInstr::FrameDestroy);
+
+ if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -1109,8 +1382,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- bool needsFrameMoves =
- MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool EmitCFI = AFI->needsDwarfUnwindInfo();
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
@@ -1128,8 +1400,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
DebugLoc DL;
const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
- if (MFnI.shouldSignReturnAddress()) {
+ if (needsShadowCallStackPrologueEpilogue(MF))
+ emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
+ MFnI.needsDwarfUnwindInfo());
+ if (MFnI.shouldSignReturnAddress()) {
unsigned PACI;
if (MFnI.shouldSignWithBKey()) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
@@ -1145,12 +1420,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(AArch64::LR)
.addReg(AArch64::SP, RegState::InternalRead);
MI.setMIFlag(MachineInstr::FrameSetup);
-
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ if (EmitCFI) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
+ if (EmitCFI && MFnI.isMTETagged()) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
+ .setMIFlag(MachineInstr::FrameSetup);
}
// We signal the presence of a Swift extended frame to external tools by
@@ -1227,7 +1507,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(-NumBytes), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
- if (needsFrameMoves) {
+ if (EmitCFI) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
@@ -1261,14 +1541,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(-NumBytes), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
+ EmitCFI);
NumBytes = 0;
} else if (HomPrologEpilog) {
// Stack has been already adjusted.
NumBytes -= PrologueSaveSize;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
+ MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
+ EmitCFI);
NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -1322,8 +1604,27 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
StackOffset::getFixed(FPOffset), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
}
+ if (EmitCFI) {
+ // Define the current CFA rule to use the provided FP.
+ const int OffsetToFirstCalleeSaveFromFP =
+ AFI->getCalleeSaveBaseToFrameRecordOffset() -
+ AFI->getCalleeSavedStackSize();
+ Register FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
}
+ // Now emit the moves for whatever callee-saved regs we have (including FP
+ // and LR if those are saved). Frame instructions for SVE registers are
+ // emitted later, after the instructions which actually save the SVE regs.
+ if (EmitCFI)
+ emitCalleeSavedGPRLocations(MBB, MBBI);
+
if (windowsRequiresStackProbe(MF, NumBytes)) {
uint64_t NumWords = NumBytes >> 4;
if (NeedsWinCFI) {
@@ -1436,14 +1737,21 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
// Allocate space for the callee saves (if any).
- emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
- -AllocateBefore, TII,
- MachineInstr::FrameSetup);
+ emitFrameOffset(
+ MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
+ MachineInstr::FrameSetup, false, false, nullptr,
+ EmitCFI && !HasFP && AllocateBefore,
+ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
+
+ if (EmitCFI)
+ emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
// Finally allocate remaining SVE stack space.
emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
- -AllocateAfter, TII,
- MachineInstr::FrameSetup);
+ -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
+ nullptr, EmitCFI && !HasFP && AllocateAfter,
+ AllocateBefore + StackOffset::getFixed(
+ (int64_t)MFI.getStackSize() - NumBytes));
// Allocate space for the rest of the frame.
if (NumBytes) {
@@ -1458,14 +1766,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
// If we're a leaf function, try using the red zone.
- if (!canUseRedZone(MF))
+ if (!canUseRedZone(MF)) {
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
- StackOffset::getFixed(-NumBytes), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
-
+ emitFrameOffset(
+ MBB, MBBI, DL, scratchSPReg, AArch64::SP,
+ StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
+ false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
+ SVEStackSize +
+ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
+ }
if (NeedsRealignment) {
const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
assert(NrBitsToZero > 1);
@@ -1532,109 +1843,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MBB.addLiveIn(AArch64::X1);
}
}
-
- if (needsFrameMoves) {
- // An example of the prologue:
- //
- // .globl __foo
- // .align 2
- // __foo:
- // Ltmp0:
- // .cfi_startproc
- // .cfi_personality 155, ___gxx_personality_v0
- // Leh_func_begin:
- // .cfi_lsda 16, Lexception33
- //
- // stp xa,bx, [sp, -#offset]!
- // ...
- // stp x28, x27, [sp, #offset-32]
- // stp fp, lr, [sp, #offset-16]
- // add fp, sp, #offset - 16
- // sub sp, sp, #1360
- //
- // The Stack:
- // +-------------------------------------------+
- // 10000 | ........ | ........ | ........ | ........ |
- // 10004 | ........ | ........ | ........ | ........ |
- // +-------------------------------------------+
- // 10008 | ........ | ........ | ........ | ........ |
- // 1000c | ........ | ........ | ........ | ........ |
- // +===========================================+
- // 10010 | X28 Register |
- // 10014 | X28 Register |
- // +-------------------------------------------+
- // 10018 | X27 Register |
- // 1001c | X27 Register |
- // +===========================================+
- // 10020 | Frame Pointer |
- // 10024 | Frame Pointer |
- // +-------------------------------------------+
- // 10028 | Link Register |
- // 1002c | Link Register |
- // +===========================================+
- // 10030 | ........ | ........ | ........ | ........ |
- // 10034 | ........ | ........ | ........ | ........ |
- // +-------------------------------------------+
- // 10038 | ........ | ........ | ........ | ........ |
- // 1003c | ........ | ........ | ........ | ........ |
- // +-------------------------------------------+
- //
- // [sp] = 10030 :: >>initial value<<
- // sp = 10020 :: stp fp, lr, [sp, #-16]!
- // fp = sp == 10020 :: mov fp, sp
- // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
- // sp == 10010 :: >>final value<<
- //
- // The frame pointer (w29) points to address 10020. If we use an offset of
- // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
- // for w27, and -32 for w28:
- //
- // Ltmp1:
- // .cfi_def_cfa w29, 16
- // Ltmp2:
- // .cfi_offset w30, -8
- // Ltmp3:
- // .cfi_offset w29, -16
- // Ltmp4:
- // .cfi_offset w27, -24
- // Ltmp5:
- // .cfi_offset w28, -32
-
- if (HasFP) {
- const int OffsetToFirstCalleeSaveFromFP =
- AFI->getCalleeSaveBaseToFrameRecordOffset() -
- AFI->getCalleeSavedStackSize();
- Register FramePtr = RegInfo->getFrameRegister(MF);
-
- // Define the current CFA rule to use the provided FP.
- unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
- } else {
- unsigned CFIIndex;
- if (SVEStackSize) {
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
- StackOffset TotalSize =
- SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
- CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
- } else {
- // Encode the stack size of the leaf function.
- CFIIndex = MF.addFrameInst(
- MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
- }
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
- }
-
- // Now emit the moves for whatever callee saved regs we have (including FP,
- // LR if those are saved).
- emitCalleeSavedFrameMoves(MBB, MBBI);
- }
}
static void InsertReturnAddressAuth(MachineFunction &MF,
@@ -1653,7 +1861,8 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
// The AUTIASP instruction assembles to a hint instruction before v8.3a so
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
- // instructions, namely RETA{A,B}, that can be used instead.
+ // instructions, namely RETA{A,B}, that can be used instead. In this case the
+ // DW_CFA_AARCH64_negate_ra_state can't be emitted.
if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
BuildMI(MBB, MBBI, DL,
@@ -1665,6 +1874,12 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
MBB, MBBI, DL,
TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
.setMIFlag(MachineInstr::FrameDestroy);
+
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameDestroy);
}
}
@@ -1686,6 +1901,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool NeedsWinCFI = needsWinCFI(MF);
+ bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
bool HasWinCFI = false;
bool IsFunclet = false;
auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
@@ -1695,6 +1911,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
IsFunclet = isFuncletReturnInstr(*MBBI);
}
+ auto FinishingTouches = make_scope_exit([&]() {
+ InsertReturnAddressAuth(MF, MBB);
+ if (needsShadowCallStackPrologueEpilogue(MF))
+ emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
+ if (EmitCFI)
+ emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
+ });
+
int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
: MFI.getStackSize();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -1707,36 +1931,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// How much of the stack used by incoming arguments this function is expected
// to restore in this particular epilogue.
int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
-
- // The stack frame should be like below,
- //
- // ---------------------- ---
- // | | |
- // | BytesInStackArgArea| CalleeArgStackSize
- // | (NumReusableBytes) | (of tail call)
- // | | ---
- // | | |
- // ---------------------| --- |
- // | | | |
- // | CalleeSavedReg | | |
- // | (CalleeSavedStackSize)| | |
- // | | | |
- // ---------------------| | NumBytes
- // | | StackSize (StackAdjustUp)
- // | LocalStackSize | | |
- // | (covering callee | | |
- // | args) | | |
- // | | | |
- // ---------------------- --- ---
- //
- // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
- // = StackSize + ArgumentPopSize
- //
- // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
- // it as the 2nd argument of AArch64ISD::TC_RETURN.
-
- auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
-
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
@@ -1771,9 +1965,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
+ bool CombineAfterCSRBump = false;
if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
- while (AArch64InstrInfo::isSEHInstruction(*Pop))
+ while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
+ AArch64InstrInfo::isSEHInstruction(*Pop))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
@@ -1781,15 +1977,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If the offset is 0 and the AfterCSR pop is not actually trying to
// allocate more stack for arguments (in space that an untimely interrupt
// may clobber), convert it to a post-index ldp.
- if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
+ if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
- else {
+ MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
+ MachineInstr::FrameDestroy, PrologueSaveSize);
+ } else {
// If not, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
AfterCSRPopSize += PrologueSaveSize;
+ CombineAfterCSRBump = true;
}
}
@@ -1822,15 +2020,27 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
}
if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
- // We need to reset FP to its untagged state on return. Bit 60 is currently
- // used to show the presence of an extended frame.
-
- // BIC x29, x29, #0x1000_0000_0000_0000
- BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
- AArch64::FP)
- .addUse(AArch64::FP)
- .addImm(0x10fe)
- .setMIFlag(MachineInstr::FrameDestroy);
+ switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+ case SwiftAsyncFramePointerMode::DeploymentBased:
+ // Avoid the reload as it is GOT relative, and instead fall back to the
+ // hardcoded value below. This allows a mismatch between the OS and
+ // application without immediately terminating on the difference.
+ LLVM_FALLTHROUGH;
+ case SwiftAsyncFramePointerMode::Always:
+ // We need to reset FP to its untagged state on return. Bit 60 is
+ // currently used to show the presence of an extended frame.
+
+ // BIC x29, x29, #0x1000_0000_0000_0000
+ BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
+ AArch64::FP)
+ .addUse(AArch64::FP)
+ .addImm(0x10fe)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ break;
+
+ case SwiftAsyncFramePointerMode::Never:
+ break;
+ }
}
const StackOffset &SVEStackSize = getSVEStackSize(MF);
@@ -1838,10 +2048,22 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+
+ // When we are about to restore the CSRs, the CFA register is SP again.
+ if (EmitCFI && hasFP(MF)) {
+ const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+ unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
+ BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
+ &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
if (HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
@@ -1873,30 +2095,44 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Deallocate the SVE area.
if (SVEStackSize) {
- if (AFI->isStackRealigned()) {
- if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
+ // If we have stack realignment or variable sized objects on the stack,
+ // restore the stack pointer from the frame pointer prior to SVE CSR
+ // restoration.
+ if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
// Set SP to start of SVE callee-save area from which they can
// be reloaded. The code below will deallocate the stack space
// by moving FP -> SP.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
StackOffset::getScalable(-CalleeSavedSize), TII,
MachineInstr::FrameDestroy);
+ }
} else {
if (AFI->getSVECalleeSavedStackSize()) {
// Deallocate the non-SVE locals first before we can deallocate (and
// restore callee saves) from the SVE area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(NumBytes), TII,
- MachineInstr::FrameDestroy);
+ emitFrameOffset(
+ MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
+ false, false, nullptr, EmitCFI && !hasFP(MF),
+ SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
NumBytes = 0;
}
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy);
+ DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
+ false, nullptr, EmitCFI && !hasFP(MF),
+ SVEStackSize +
+ StackOffset::getFixed(NumBytes + PrologueSaveSize));
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy);
+ DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
+ false, nullptr, EmitCFI && !hasFP(MF),
+ DeallocateAfter +
+ StackOffset::getFixed(NumBytes + PrologueSaveSize));
}
+ if (EmitCFI)
+ emitCalleeSavedSVERestores(MBB, RestoreEnd);
}
if (!hasFP(MF)) {
@@ -1906,23 +2142,24 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (RedZone && AfterCSRPopSize == 0)
return;
+ // Pop the local variables off the stack. If there are no callee-saved
+ // registers, it means we are actually positioned at the terminator and can
+ // combine stack increment for the locals and the stack increment for
+ // callee-popped arguments into (possibly) a single instruction and be done.
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
+ emitFrameOffset(
+ MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(StackRestoreBytes), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
+ StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
+
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
- bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
-
- // If we're done after this, make sure to help the load store optimizer.
- if (Done)
- adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
-
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(StackRestoreBytes), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- if (Done) {
+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
if (HasWinCFI) {
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
@@ -1948,29 +2185,29 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
StackOffset::getFixed(NumBytes), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ // When we are about to restore the CSRs, the CFA register is SP again.
+ if (EmitCFI && hasFP(MF)) {
+ const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+ unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
+ BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
if (AfterCSRPopSize) {
assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
"interrupt may have clobbered");
- // Find an insertion point for the first ldp so that it goes before the
- // shadow call stack epilog instruction. This ensures that the restore of
- // lr from x18 is placed after the restore from sp.
- auto FirstSPPopI = MBB.getFirstTerminator();
- while (FirstSPPopI != Begin) {
- auto Prev = std::prev(FirstSPPopI);
- if (Prev->getOpcode() != AArch64::LDRXpre ||
- Prev->getOperand(0).getReg() == AArch64::SP)
- break;
- FirstSPPopI = Prev;
- }
- adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
-
- emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AfterCSRPopSize), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(
+ MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
+ false, NeedsWinCFI, &HasWinCFI, EmitCFI,
+ StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
}
if (HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
@@ -2061,8 +2298,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
// right thing for the emergency spill slot.
bool UseFP = false;
if (AFI->hasStackFrame() && !isSVE) {
- // We shouldn't prefer using the FP when there is an SVE area
- // in between the FP and the non-SVE locals/spills.
+ // We shouldn't prefer using the FP to access fixed-sized stack objects when
+ // there are scalable (SVE) objects in between the FP and the fixed-sized
+ // objects.
PreferFP &= !SVEStackSize;
// Note: Keeping the following as multiple 'if' statements rather than
@@ -2083,7 +2321,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
// offsets is smaller than for positive ones. If an offset is available
// via the FP and the SP, use whichever is closest.
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
- PreferFP |= Offset > -FPOffset;
+ PreferFP |= Offset > -FPOffset && !SVEStackSize;
if (MFI.hasVarSizedObjects()) {
// If we have variable sized objects, we can use either FP or BP, as the
@@ -2270,7 +2508,7 @@ struct RegPairInfo {
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
- bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
+ bool NeedsFrameRecord) {
if (CSI.empty())
return;
@@ -2349,15 +2587,6 @@ static void computeCalleeSaveRegisterPairs(
}
}
- // If either of the registers to be saved is the lr register, it means that
- // we also need to save lr in the shadow call stack.
- if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
- MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
- if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
- report_fatal_error("Must reserve x18 to use shadow call stack");
- NeedShadowCallStackProlog = true;
- }
-
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
@@ -2476,43 +2705,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
- bool NeedShadowCallStackProlog = false;
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog, hasFP(MF));
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
- if (NeedShadowCallStackProlog) {
- // Shadow call stack prolog: str x30, [x18], #8
- BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
- .addReg(AArch64::X18, RegState::Define)
- .addReg(AArch64::LR)
- .addReg(AArch64::X18)
- .addImm(8)
- .setMIFlag(MachineInstr::FrameSetup);
-
- if (NeedsWinCFI)
- BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
-
- // Emit a CFI instruction that causes 8 to be subtracted from the value of
- // x18 when unwinding past this frame.
- static const char CFIInst[] = {
- dwarf::DW_CFA_val_expression,
- 18, // register
- 2, // length
- static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
- static_cast<char>(-8) & 0x7f, // addend (sleb128)
- };
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
- nullptr, StringRef(CFIInst, sizeof(CFIInst))));
- BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlag(MachineInstr::FrameSetup);
-
- // This instruction also makes x18 live-in to the entry block.
- MBB.addLiveIn(AArch64::X18);
- }
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
if (homogeneousPrologEpilog(MF)) {
auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
.setMIFlag(MachineInstr::FrameSetup);
@@ -2622,7 +2817,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -2630,14 +2825,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedsWinCFI = needsWinCFI(MF);
- if (MI != MBB.end())
- DL = MI->getDebugLoc();
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
- bool NeedShadowCallStackProlog = false;
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog, hasFP(MF));
+ computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) {
+ auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -2694,7 +2887,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
MIB.addReg(Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
@@ -2711,6 +2904,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+
+ return MIB->getIterator();
};
// SVE objects are always restored in reverse order.
@@ -2718,31 +2913,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (RPI.isScalable())
EmitMI(RPI);
- if (ReverseCSRRestoreSeq) {
- for (const RegPairInfo &RPI : reverse(RegPairs))
- if (!RPI.isScalable())
- EmitMI(RPI);
- } else if (homogeneousPrologEpilog(MF, &MBB)) {
- auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+ if (homogeneousPrologEpilog(MF, &MBB)) {
+ auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
for (auto &RPI : RegPairs) {
MIB.addReg(RPI.Reg1, RegState::Define);
MIB.addReg(RPI.Reg2, RegState::Define);
}
return true;
- } else
- for (const RegPairInfo &RPI : RegPairs)
- if (!RPI.isScalable())
- EmitMI(RPI);
-
- if (NeedShadowCallStackProlog) {
- // Shadow call stack epilog: ldr x30, [x18, #-8]!
- BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
- .addReg(AArch64::X18, RegState::Define)
- .addReg(AArch64::LR, RegState::Define)
- .addReg(AArch64::X18)
- .addImm(-8)
- .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
+ if (ReverseCSRRestoreSeq) {
+ MachineBasicBlock::iterator First = MBB.end();
+ for (const RegPairInfo &RPI : reverse(RegPairs)) {
+ if (RPI.isScalable())
+ continue;
+ MachineBasicBlock::iterator It = EmitMI(RPI);
+ if (First == MBB.end())
+ First = It;
+ }
+ if (First != MBB.end())
+ MBB.splice(MBBI, &MBB, First);
+ } else {
+ for (const RegPairInfo &RPI : RegPairs) {
+ if (RPI.isScalable())
+ continue;
+ (void)EmitMI(RPI);
+ }
}
return true;
@@ -2941,6 +3138,15 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
// stack slots for them.
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ bool UsesWinAAPCS = isTargetWindows(MF);
+ if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
+ int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
+ AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
@@ -2954,7 +3160,8 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
// Grab 8 bytes below FP for the extended asynchronous frame info.
- if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
+ if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
+ Reg == AArch64::FP) {
FrameIdx = MFI.CreateStackObject(8, Alignment, true);
AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
@@ -3190,7 +3397,7 @@ public:
// instructions. May skip if the replacement is not profitable. May invalidate
// the input iterator and replace it with a valid one.
void emitCode(MachineBasicBlock::iterator &InsertI,
- const AArch64FrameLowering *TFI, bool IsLast);
+ const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
};
void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
@@ -3329,7 +3536,8 @@ void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
}
void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
- const AArch64FrameLowering *TFI, bool IsLast) {
+ const AArch64FrameLowering *TFI,
+ bool TryMergeSPUpdate) {
if (TagStores.empty())
return;
TagStoreInstr &FirstTagStore = TagStores[0];
@@ -3359,8 +3567,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
emitUnrolled(InsertI);
} else {
MachineInstr *UpdateInstr = nullptr;
- int64_t TotalOffset;
- if (IsLast) {
+ int64_t TotalOffset = 0;
+ if (TryMergeSPUpdate) {
// See if we can merge base register update into the STGloop.
// This is done in AArch64LoadStoreOptimizer for "normal" stores,
// but STGloop is way too unusual for that, and also it only
@@ -3505,7 +3713,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
for (auto &Instr : Instrs) {
if (EndOffset && *EndOffset != Instr.Offset) {
// Found a gap.
- TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+ TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
TSE.clear();
}
@@ -3513,7 +3721,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
EndOffset = Instr.Offset + Instr.Size;
}
- TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+ // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
+ TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
+ !MBB->getParent()
+ ->getInfo<AArch64FunctionInfo>()
+ ->needsAsyncDwarfUnwindInfo());
return InsertI;
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 31f57cbc49f2..f59860a24d9b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -29,6 +29,8 @@ public:
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const;
+ void resetCFIToInitialState(MachineBasicBlock &MBB) const override;
+
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
@@ -141,13 +143,20 @@ private:
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
- MCCFIInstruction
- createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI,
- const StackOffset &OffsetFromSP) const;
- MCCFIInstruction createCfaOffset(const TargetRegisterInfo &MRI, unsigned DwarfReg,
- const StackOffset &OffsetFromDefCFA) const;
bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
unsigned StackBumpBytes) const;
+ void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+ void emitCalleeSavedSVELocations(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+ void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+ void emitCalleeSavedSVERestores(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+
+ /// Emit target zero call-used regs.
+ void emitZeroCallUsedRegs(BitVector RegsToZero,
+ MachineBasicBlock &MBB) const override;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 899f069abdd4..82fe5772c99d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -159,6 +159,22 @@ public:
return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
}
+ bool SelectExtractHigh(SDValue N, SDValue &Res) {
+ if (Subtarget->isLittleEndian() && N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+ if (N->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isa<ConstantSDNode>(N->getOperand(1)))
+ return false;
+ EVT VT = N->getValueType(0);
+ EVT LVT = N->getOperand(0).getValueType();
+ unsigned Index = N->getConstantOperandVal(1);
+ if (!VT.is64BitVector() || !LVT.is128BitVector() ||
+ Index != VT.getVectorNumElements())
+ return false;
+ Res = N->getOperand(0);
+ return true;
+ }
+
bool SelectDupZeroOrUndef(SDValue N) {
switch(N->getOpcode()) {
case ISD::UNDEF:
@@ -204,6 +220,11 @@ public:
return SelectSVEAddSubImm(N, VT, Imm, Shift);
}
+ template <MVT::SimpleValueType VT>
+ bool SelectSVECpyDupImm(SDValue N, SDValue &Imm, SDValue &Shift) {
+ return SelectSVECpyDupImm(N, VT, Imm, Shift);
+ }
+
template <MVT::SimpleValueType VT, bool Invert = false>
bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
return SelectSVELogicalImm(N, VT, Imm, Invert);
@@ -219,6 +240,16 @@ public:
return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm);
}
+ bool SelectSVEShiftSplatImmR(SDValue N, SDValue &Imm) {
+ if (N->getOpcode() != ISD::SPLAT_VECTOR)
+ return false;
+
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ return SelectSVEShiftImm(N->getOperand(0), /* Low */ 1,
+ /* High */ EltVT.getFixedSizeInBits(),
+ /* AllowSaturation */ true, Imm);
+ }
+
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
template<signed Min, signed Max, signed Scale, bool Shift>
bool SelectCntImm(SDValue N, SDValue &Imm) {
@@ -257,6 +288,15 @@ public:
return false;
}
+ template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
+ if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t C = CI->getZExtValue();
+ Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
+ return true;
+ }
+ return false;
+ }
+
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element that is returned
@@ -300,6 +340,11 @@ public:
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}
+ template <unsigned Scale>
+ bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
+ return SelectSMETileSlice(N, Scale, Vector, Offset);
+ }
+
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -357,10 +402,8 @@ private:
bool SelectCMP_SWAP(SDNode *N);
- bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
-
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
-
+ bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
@@ -370,6 +413,8 @@ private:
bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
+ bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
+ SDValue &Offset);
bool SelectAllActivePredicate(SDValue N);
};
@@ -822,9 +867,17 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
Reg = N.getOperand(0);
- // Don't match if free 32-bit -> 64-bit zext can be used instead.
- if (Ext == AArch64_AM::UXTW &&
- Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+ // Don't match if free 32-bit -> 64-bit zext can be used instead. Use the
+ // isDef32 as a heuristic for when the operand is likely to be a 32bit def.
+ auto isDef32 = [](SDValue N) {
+ unsigned Opc = N.getOpcode();
+ return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+ Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
+ Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
+ Opc != ISD::FREEZE;
+ };
+ if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 &&
+ isDef32(Reg))
return false;
}
@@ -1852,6 +1905,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
VT = Opd0->getValueType(0);
} else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
Opd0 = Op0->getOperand(0);
+ ClampMSB = (VT == MVT::i32);
} else if (BiggerPattern) {
// Let's pretend a 0 shift right has been performed.
// The resulting code will be at least as good as the original one
@@ -2710,8 +2764,16 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
// shift the needed bits into place.
SDLoc DL(N);
unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+ uint64_t LsrImm = LSB;
+ if (Src->hasOneUse() &&
+ isOpcWithIntImmediate(Src.getNode(), ISD::SRL, LsrImm) &&
+ (LsrImm + LSB) < BitWidth) {
+ Src = Src->getOperand(0);
+ LsrImm += LSB;
+ }
+
SDNode *LSR = CurDAG->getMachineNode(
- ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+ ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LsrImm, DL, VT),
CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
// BFXIL is an alias of BFM, so translate to BFM operands.
@@ -2827,15 +2889,15 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
SDValue Add1 = ShiftAmt->getOperand(1);
uint64_t Add0Imm;
uint64_t Add1Imm;
- // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
- // to avoid the ADD/SUB.
- if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
+ if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) {
+ // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
+ // to avoid the ADD/SUB.
NewShiftAmt = Add0;
- // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
- // generate a NEG instead of a SUB of a constant.
- else if (ShiftAmt->getOpcode() == ISD::SUB &&
- isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
- (Add0Imm % Size == 0)) {
+ } else if (ShiftAmt->getOpcode() == ISD::SUB &&
+ isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
+ (Add0Imm % Size == 0)) {
+ // If we are shifting by N-X where N == 0 mod Size, then just shift by -X
+ // to generate a NEG instead of a SUB from a constant.
unsigned NegOpc;
unsigned ZeroReg;
EVT SubVT = ShiftAmt->getValueType(0);
@@ -2852,6 +2914,26 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
MachineSDNode *Neg =
CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
NewShiftAmt = SDValue(Neg, 0);
+ } else if (ShiftAmt->getOpcode() == ISD::SUB &&
+ isIntImmediate(Add0, Add0Imm) && (Add0Imm % Size == Size - 1)) {
+ // If we are shifting by N-X where N == -1 mod Size, then just shift by ~X
+ // to generate a NOT instead of a SUB from a constant.
+ unsigned NotOpc;
+ unsigned ZeroReg;
+ EVT SubVT = ShiftAmt->getValueType(0);
+ if (SubVT == MVT::i32) {
+ NotOpc = AArch64::ORNWrr;
+ ZeroReg = AArch64::WZR;
+ } else {
+ assert(SubVT == MVT::i64);
+ NotOpc = AArch64::ORNXrr;
+ ZeroReg = AArch64::XZR;
+ }
+ SDValue Zero =
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
+ MachineSDNode *Not =
+ CurDAG->getMachineNode(NotOpc, DL, SubVT, Zero, Add1);
+ NewShiftAmt = SDValue(Not, 0);
} else
return false;
} else {
@@ -3108,72 +3190,81 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
return true;
}
-bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
- SDValue &Offset) {
- auto C = dyn_cast<ConstantSDNode>(N);
- if (!C)
+bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm,
+ SDValue &Shift) {
+ if (!isa<ConstantSDNode>(N))
return false;
- auto Ty = N->getValueType(0);
-
- int64_t Imm = C->getSExtValue();
SDLoc DL(N);
-
- if ((Imm >= -128) && (Imm <= 127)) {
- Base = CurDAG->getTargetConstant(Imm, DL, Ty);
- Offset = CurDAG->getTargetConstant(0, DL, Ty);
- return true;
- }
-
- if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
- Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
- Offset = CurDAG->getTargetConstant(8, DL, Ty);
+ uint64_t Val = cast<ConstantSDNode>(N)
+ ->getAPIntValue()
+ .trunc(VT.getFixedSizeInBits())
+ .getZExtValue();
+
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ // All immediates are supported.
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
return true;
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ // Support 8bit unsigned immediates.
+ if (Val <= 255) {
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
+ return true;
+ }
+ // Support 16bit unsigned immediates that are a multiple of 256.
+ if (Val <= 65280 && Val % 256 == 0) {
+ Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32);
+ return true;
+ }
+ break;
+ default:
+ break;
}
return false;
}
-bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
- if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
- const int64_t ImmVal = CNode->getSExtValue();
- SDLoc DL(N);
+bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm,
+ SDValue &Shift) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
- switch (VT.SimpleTy) {
- case MVT::i8:
- // Can always select i8s, no shift, mask the immediate value to
- // deal with sign-extended value from lowering.
+ SDLoc DL(N);
+ int64_t Val = cast<ConstantSDNode>(N)
+ ->getAPIntValue()
+ .trunc(VT.getFixedSizeInBits())
+ .getSExtValue();
+
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ // All immediates are supported.
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
+ return true;
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ // Support 8bit signed immediates.
+ if (Val >= -128 && Val <= 127) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(ImmVal & 0xFF, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
+ return true;
+ }
+ // Support 16bit signed immediates that are a multiple of 256.
+ if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
+ Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
return true;
- case MVT::i16:
- // i16 values get sign-extended to 32-bits during lowering.
- if ((ImmVal & 0xFF) == ImmVal) {
- Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
- return true;
- } else if ((ImmVal & 0xFF) == 0) {
- Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32);
- return true;
- }
- break;
- case MVT::i32:
- case MVT::i64:
- // Range of immediate won't trigger signedness problems for 32/64b.
- if ((ImmVal & 0xFF) == ImmVal) {
- Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
- return true;
- } else if ((ImmVal & 0xFF00) == ImmVal) {
- Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
- return true;
- }
- break;
- default:
- break;
}
+ break;
+ default:
+ break;
}
return false;
@@ -3901,7 +3992,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
true);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H,
true);
return;
@@ -3922,7 +4013,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
true);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H,
true);
return;
@@ -3943,7 +4034,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
true);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H,
true);
return;
@@ -4267,7 +4358,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4284,7 +4375,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4301,7 +4392,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4911,7 +5002,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4928,7 +5019,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4945,7 +5036,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);
return;
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
- (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ VT == MVT::nxv8bf16) {
SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);
return;
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -5033,6 +5124,10 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
const unsigned IntNo =
cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::aarch64_sme_ldr ||
+ IntNo == Intrinsic::aarch64_sme_str)
+ return MVT::nxv16i8;
+
if (IntNo != Intrinsic::aarch64_sve_prf)
return EVT();
@@ -5051,12 +5146,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
SDValue &OffImm) {
const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
const DataLayout &DL = CurDAG->getDataLayout();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
- return true;
+ // We can only encode VL scaled offsets, so only fold in frame indexes
+ // referencing SVE objects.
+ if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ return false;
}
if (MemVT == EVT())
@@ -5083,7 +5185,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ // We can only encode VL scaled offsets, so only fold in frame indexes
+ // referencing SVE objects.
+ if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector)
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
@@ -5149,3 +5254,30 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
return TLI->isAllActivePredicate(*CurDAG, N);
}
+
+bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
+ SDValue &Base, SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD) {
+ Base = N;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ // Process an ADD node.
+ const SDValue LHS = N.getOperand(0);
+ const SDValue RHS = N.getOperand(1);
+
+ if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = C->getSExtValue();
+ unsigned MaxSize = (1 << Scale) - 1;
+
+ if (ImmOff < 0 || ImmOff > MaxSize)
+ return false;
+
+ Base = LHS;
+ Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c539c8617d99..abfe2d507111 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -208,6 +208,7 @@ static bool isMergePassthruOpcode(unsigned Opc) {
case AArch64ISD::BSWAP_MERGE_PASSTHRU:
case AArch64ISD::REVH_MERGE_PASSTHRU:
case AArch64ISD::REVW_MERGE_PASSTHRU:
+ case AArch64ISD::REVD_MERGE_PASSTHRU:
case AArch64ISD::CTLZ_MERGE_PASSTHRU:
case AArch64ISD::CTPOP_MERGE_PASSTHRU:
case AArch64ISD::DUP_MERGE_PASSTHRU:
@@ -289,8 +290,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8bf16);
}
- if (Subtarget->hasSVE()) {
+ if (Subtarget->hasSVE() || Subtarget->hasSME()) {
// Add legal sve predicate types
+ addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
@@ -324,50 +326,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
}
-
- for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
- setOperationAction(ISD::SADDSAT, VT, Legal);
- setOperationAction(ISD::UADDSAT, VT, Legal);
- setOperationAction(ISD::SSUBSAT, VT, Legal);
- setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::UREM, VT, Expand);
- setOperationAction(ISD::SREM, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Expand);
- setOperationAction(ISD::UDIVREM, VT, Expand);
- }
-
- for (auto VT :
- { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
- MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
-
- for (auto VT :
- { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
- MVT::nxv2f64 }) {
- setCondCodeAction(ISD::SETO, VT, Expand);
- setCondCodeAction(ISD::SETOLT, VT, Expand);
- setCondCodeAction(ISD::SETLT, VT, Expand);
- setCondCodeAction(ISD::SETOLE, VT, Expand);
- setCondCodeAction(ISD::SETLE, VT, Expand);
- setCondCodeAction(ISD::SETULT, VT, Expand);
- setCondCodeAction(ISD::SETULE, VT, Expand);
- setCondCodeAction(ISD::SETUGE, VT, Expand);
- setCondCodeAction(ISD::SETUGT, VT, Expand);
- setCondCodeAction(ISD::SETUEQ, VT, Expand);
- setCondCodeAction(ISD::SETUNE, VT, Expand);
-
- setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOW, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
- setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FSIN, VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- setOperationAction(ISD::FEXP, VT, Expand);
- setOperationAction(ISD::FEXP2, VT, Expand);
- setOperationAction(ISD::FLOG, VT, Expand);
- setOperationAction(ISD::FLOG2, VT, Expand);
- setOperationAction(ISD::FLOG10, VT, Expand);
- }
}
// Compute derived properties from the register classes
@@ -389,7 +347,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
- setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
@@ -448,6 +406,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
+ // aren't handled.
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
@@ -508,16 +468,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
- // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
- setOperationAction(ISD::ADDC, MVT::i64, Custom);
- setOperationAction(ISD::ADDE, MVT::i64, Custom);
- setOperationAction(ISD::SUBC, MVT::i64, Custom);
- setOperationAction(ISD::SUBE, MVT::i64, Custom);
-
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
@@ -568,6 +518,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
+ setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
+
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
@@ -581,64 +540,41 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
else
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::v4f16, Expand);
- setOperationAction(ISD::FREM, MVT::v8f16, Expand);
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
- setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::f16, Promote);
- setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
- setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
- setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
- setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
- setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
- setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
- setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+ for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
+ ISD::FEXP, ISD::FEXP2, ISD::FLOG,
+ ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM,
+ ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
+ ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
+ ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::v4f16, Expand);
+ setOperationAction(Op, MVT::v8f16, Expand);
+ }
if (!Subtarget->hasFullFP16()) {
- setOperationAction(ISD::SELECT, MVT::f16, Promote);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
- setOperationAction(ISD::SETCC, MVT::f16, Promote);
- setOperationAction(ISD::BR_CC, MVT::f16, Promote);
- setOperationAction(ISD::FADD, MVT::f16, Promote);
- setOperationAction(ISD::FSUB, MVT::f16, Promote);
- setOperationAction(ISD::FMUL, MVT::f16, Promote);
- setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
- setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FABS, MVT::f16, Promote);
- setOperationAction(ISD::FCEIL, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
- setOperationAction(ISD::FRINT, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
- setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
+ for (auto Op :
+ {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
+ ISD::BR_CC, ISD::FADD, ISD::FSUB,
+ ISD::FMUL, ISD::FDIV, ISD::FMA,
+ ISD::FNEG, ISD::FABS, ISD::FCEIL,
+ ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
+ ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
+ ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
+ ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
+ setOperationAction(Op, MVT::f16, Promote);
+
+ // Round-to-integer need custom lowering for fp16, as Promote doesn't work
+ // because the result type is integer.
+ for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
+ ISD::STRICT_LLRINT})
+ setOperationAction(Op, MVT::f16, Custom);
// promote v4f16 to v4f32 when that is known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
@@ -691,37 +627,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// AArch64 has implementations of a lot of rounding-like FP operations.
- for (MVT Ty : {MVT::f32, MVT::f64}) {
- setOperationAction(ISD::FFLOOR, Ty, Legal);
- setOperationAction(ISD::FNEARBYINT, Ty, Legal);
- setOperationAction(ISD::FCEIL, Ty, Legal);
- setOperationAction(ISD::FRINT, Ty, Legal);
- setOperationAction(ISD::FTRUNC, Ty, Legal);
- setOperationAction(ISD::FROUND, Ty, Legal);
- setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
- setOperationAction(ISD::FMINNUM, Ty, Legal);
- setOperationAction(ISD::FMAXNUM, Ty, Legal);
- setOperationAction(ISD::FMINIMUM, Ty, Legal);
- setOperationAction(ISD::FMAXIMUM, Ty, Legal);
- setOperationAction(ISD::LROUND, Ty, Legal);
- setOperationAction(ISD::LLROUND, Ty, Legal);
- setOperationAction(ISD::LRINT, Ty, Legal);
- setOperationAction(ISD::LLRINT, Ty, Legal);
- }
-
- if (Subtarget->hasFullFP16()) {
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
- setOperationAction(ISD::FCEIL, MVT::f16, Legal);
- setOperationAction(ISD::FRINT, MVT::f16, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
- setOperationAction(ISD::FROUND, MVT::f16, Legal);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
- }
+ for (auto Op :
+ {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
+ ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
+ ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
+ ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
+ ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
+ ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
+ ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
+ for (MVT Ty : {MVT::f32, MVT::f64})
+ setOperationAction(Op, Ty, Legal);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(Op, MVT::f16, Legal);
+ }
+
+ // Basic strict FP operations are legal
+ for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
+ for (MVT Ty : {MVT::f32, MVT::f64})
+ setOperationAction(Op, Ty, Legal);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(Op, MVT::f16, Legal);
+ }
+
+ // Strict conversion to a larger type is legal
+ for (auto VT : {MVT::f32, MVT::f64})
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
@@ -891,47 +825,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV..
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::ABS);
- setTargetDAGCombine(ISD::SUB);
- setTargetDAGCombine(ISD::XOR);
- setTargetDAGCombine(ISD::SINT_TO_FP);
- setTargetDAGCombine(ISD::UINT_TO_FP);
-
- setTargetDAGCombine(ISD::FP_TO_SINT);
- setTargetDAGCombine(ISD::FP_TO_UINT);
- setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
- setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
- setTargetDAGCombine(ISD::FDIV);
+ setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP});
+
+ setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
+ ISD::FP_TO_UINT_SAT, ISD::FDIV});
// Try and combine setcc with csel
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
- setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::SIGN_EXTEND);
- setTargetDAGCombine(ISD::VECTOR_SPLICE);
- setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::TRUNCATE);
- setTargetDAGCombine(ISD::CONCAT_VECTORS);
- setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
- setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
+ ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
+ ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
+ ISD::INSERT_SUBVECTOR, ISD::STORE});
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
+
+ setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
+ ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
+ ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::VECREDUCE_ADD);
- setTargetDAGCombine(ISD::STEP_VECTOR);
+ setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
setTargetDAGCombine(ISD::FP_EXTEND);
@@ -980,43 +900,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
- setOperationAction(ISD::FABS, MVT::v1f64, Expand);
- setOperationAction(ISD::FADD, MVT::v1f64, Expand);
- setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
- setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
- setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
- setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
- setOperationAction(ISD::FMA, MVT::v1f64, Expand);
- setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
- setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
- setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
- setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
- setOperationAction(ISD::FREM, MVT::v1f64, Expand);
- setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
- setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand);
- setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
- setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
- setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
- setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
- setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
- setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
- setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
-
- setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
- setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
-
- setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand);
- setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand);
-
- setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+ for (auto Op :
+ {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
+ ISD::BR_CC, ISD::FADD, ISD::FSUB,
+ ISD::FMUL, ISD::FDIV, ISD::FMA,
+ ISD::FNEG, ISD::FABS, ISD::FCEIL,
+ ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
+ ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
+ ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
+ ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
+ setOperationAction(Op, MVT::v1f64, Expand);
+
+ for (auto Op :
+ {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
+ ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
+ ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
+ setOperationAction(Op, MVT::v1i64, Expand);
// AArch64 doesn't have a direct vector ->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
@@ -1024,14 +930,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Or, direct i32 -> f16 vector conversion. Set it so custom, so the
// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP})
+ for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
+ setOperationAction(Op, VT, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
@@ -1088,6 +992,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
MVT::v4i32}) {
+ setOperationAction(ISD::AVGFLOORS, VT, Legal);
+ setOperationAction(ISD::AVGFLOORU, VT, Legal);
+ setOperationAction(ISD::AVGCEILS, VT, Legal);
+ setOperationAction(ISD::AVGCEILU, VT, Legal);
setOperationAction(ISD::ABDS, VT, Legal);
setOperationAction(ISD::ABDU, VT, Legal);
}
@@ -1141,31 +1049,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// AArch64 has implementations of a lot of rounding-like FP operations.
- for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
- setOperationAction(ISD::FFLOOR, Ty, Legal);
- setOperationAction(ISD::FNEARBYINT, Ty, Legal);
- setOperationAction(ISD::FCEIL, Ty, Legal);
- setOperationAction(ISD::FRINT, Ty, Legal);
- setOperationAction(ISD::FTRUNC, Ty, Legal);
- setOperationAction(ISD::FROUND, Ty, Legal);
- setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
- }
-
- if (Subtarget->hasFullFP16()) {
- for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
- setOperationAction(ISD::FFLOOR, Ty, Legal);
- setOperationAction(ISD::FNEARBYINT, Ty, Legal);
- setOperationAction(ISD::FCEIL, Ty, Legal);
- setOperationAction(ISD::FRINT, Ty, Legal);
- setOperationAction(ISD::FTRUNC, Ty, Legal);
- setOperationAction(ISD::FROUND, Ty, Legal);
- setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
- }
+ for (auto Op :
+ {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
+ ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
+ ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
+ ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
+ for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
+ setOperationAction(Op, Ty, Legal);
+ if (Subtarget->hasFullFP16())
+ for (MVT Ty : {MVT::v4f16, MVT::v8f16})
+ setOperationAction(Op, Ty, Legal);
}
- if (Subtarget->hasSVE())
- setOperationAction(ISD::VSCALE, MVT::i32, Custom);
-
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
@@ -1174,6 +1069,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+
+ // ADDP custom lowering
+ for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::ADD, VT, Custom);
+ // FADDP custom lowering
+ for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::FADD, VT, Custom);
+ }
+
+ if (Subtarget->hasSME()) {
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
}
if (Subtarget->hasSVE()) {
@@ -1194,7 +1100,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
setOperationAction(ISD::MULHU, VT, Custom);
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
@@ -1224,6 +1130,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
+
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
}
// Illegal unpacked integer vector types.
@@ -1234,10 +1149,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Legalize unpacked bitcasts to REINTERPRET_CAST.
for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
- MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
+ MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
setOperationAction(ISD::BITCAST, VT, Custom);
- for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
+ for (auto VT :
+ { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
+ MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
+
+ for (auto VT :
+ {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
@@ -1269,18 +1190,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
- for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
- // Avoid marking truncating FP stores as legal to prevent the
- // DAGCombiner from creating unsupported truncating stores.
+ // Firstly, exclude all scalable vector extending loads/truncating stores,
+ // including both integer and floating-point scalable vectors.
+ for (MVT VT : MVT::scalable_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
- // SVE does not have floating-point extending loads.
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
+ // Then, selectively enable those which we directly support.
+ setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
+ setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
+ setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
+ setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
+ setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
+ setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
+ for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
+ setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
+ setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
+ setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
+ setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
+ setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
+ }
+
// SVE supports truncating stores of 64 and 128-bit vectors
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
@@ -1295,7 +1231,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1326,6 +1262,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+
+ setCondCodeAction(ISD::SETO, VT, Expand);
+ setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETLT, VT, Expand);
+ setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETUGT, VT, Expand);
+ setCondCodeAction(ISD::SETUEQ, VT, Expand);
+ setCondCodeAction(ISD::SETONE, VT, Expand);
}
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
@@ -1334,13 +1293,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
}
- setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
-
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+ // NEON doesn't support integer divides, but SVE does
+ for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+ MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ }
+
+ // NEON doesn't support 64-bit vector integer muls, but SVE does.
+ setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1367,32 +1336,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
- setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
- setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
- setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
- setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
- setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
- setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
- setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
- setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
- setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
- setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
- setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
- setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
- setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
- setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
- setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
- setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
@@ -1426,6 +1377,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
+
+ setOperationAction(ISD::VSCALE, MVT::i32, Custom);
}
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -1434,6 +1387,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
+
+ IsStrictFPEnabled = true;
}
void AArch64TargetLowering::addTypeForNEON(MVT VT) {
@@ -1490,10 +1445,10 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FP_TO_SINT, VT, Custom);
- setOperationAction(ISD::FP_TO_UINT, VT, Custom);
- setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
- setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
+ for (unsigned Opcode :
+ {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
+ ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
+ setOperationAction(Opcode, VT, Custom);
if (!VT.isFloatingPoint())
setOperationAction(ISD::ABS, VT, Legal);
@@ -1503,14 +1458,39 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
- // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
+ // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
+ // NEON types.
if (VT.isFloatingPoint() &&
VT.getVectorElementType() != MVT::bf16 &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
- {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
+ {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
+ ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
+ ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
+ ISD::STRICT_FSQRT})
setOperationAction(Opcode, VT, Legal);
+ // Strict fp extend and trunc are legal
+ if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
+ if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
+
+ // FIXME: We could potentially make use of the vector comparison instructions
+ // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
+ // complications:
+ // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
+ // so we would need to expand when the condition code doesn't match the
+ // kind of comparison.
+ // * Some kinds of comparison require more than one FCMXY instruction so
+ // would need to be expanded instead.
+ // * The lowering of the non-strict versions involves target-specific ISD
+ // nodes so we would likely need to add strict versions of all of them and
+ // handle them appropriately.
+ setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -1526,9 +1506,11 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
if (!Subtarget->hasSVE())
return true;
- // We can only support legal predicate result types.
+ // We can only support legal predicate result types. We can use the SVE
+ // whilelo instruction for generating fixed-width predicates too.
if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
- ResVT != MVT::nxv16i1)
+ ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
+ ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
return true;
// The whilelo instruction only works with i32 or i64 scalar inputs.
@@ -1559,7 +1541,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setCondCodeAction(ISD::SETUGE, VT, Expand);
setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);
- setCondCodeAction(ISD::SETUNE, VT, Expand);
+ setCondCodeAction(ISD::SETONE, VT, Expand);
}
// Mark integer truncating stores/extending loads as having custom lowering
@@ -1830,11 +1812,21 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op, KnownBits &Known,
- const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
+ case AArch64ISD::DUP: {
+ SDValue SrcOp = Op.getOperand(0);
+ Known = DAG.computeKnownBits(SrcOp, Depth + 1);
+ if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
+ assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
+ "Expected DUP implicit truncation");
+ Known = Known.trunc(Op.getScalarValueSizeInBits());
+ }
+ break;
+ }
case AArch64ISD::CSEL: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
@@ -2006,7 +1998,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::ABDS_PRED)
MAKE_CASE(AArch64ISD::ABDU_PRED)
- MAKE_CASE(AArch64ISD::ADD_PRED)
MAKE_CASE(AArch64ISD::MUL_PRED)
MAKE_CASE(AArch64ISD::MULHS_PRED)
MAKE_CASE(AArch64ISD::MULHU_PRED)
@@ -2016,7 +2007,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::SMIN_PRED)
MAKE_CASE(AArch64ISD::SRA_PRED)
MAKE_CASE(AArch64ISD::SRL_PRED)
- MAKE_CASE(AArch64ISD::SUB_PRED)
MAKE_CASE(AArch64ISD::UDIV_PRED)
MAKE_CASE(AArch64ISD::UMAX_PRED)
MAKE_CASE(AArch64ISD::UMIN_PRED)
@@ -2061,6 +2051,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::DUPLANE16)
MAKE_CASE(AArch64ISD::DUPLANE32)
MAKE_CASE(AArch64ISD::DUPLANE64)
+ MAKE_CASE(AArch64ISD::DUPLANE128)
MAKE_CASE(AArch64ISD::MOVI)
MAKE_CASE(AArch64ISD::MOVIshift)
MAKE_CASE(AArch64ISD::MOVIedit)
@@ -2108,10 +2099,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FCMLTz)
MAKE_CASE(AArch64ISD::SADDV)
MAKE_CASE(AArch64ISD::UADDV)
- MAKE_CASE(AArch64ISD::SRHADD)
- MAKE_CASE(AArch64ISD::URHADD)
- MAKE_CASE(AArch64ISD::SHADD)
- MAKE_CASE(AArch64ISD::UHADD)
MAKE_CASE(AArch64ISD::SDOT)
MAKE_CASE(AArch64ISD::UDOT)
MAKE_CASE(AArch64ISD::SMINV)
@@ -2150,6 +2137,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FMINNMV_PRED)
MAKE_CASE(AArch64ISD::FMUL_PRED)
MAKE_CASE(AArch64ISD::FSUB_PRED)
+ MAKE_CASE(AArch64ISD::RDSVL)
MAKE_CASE(AArch64ISD::BIC)
MAKE_CASE(AArch64ISD::BIT)
MAKE_CASE(AArch64ISD::CBZ)
@@ -2267,10 +2255,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ MAKE_CASE(AArch64ISD::ADDP)
+ MAKE_CASE(AArch64ISD::SADDLP)
MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
@@ -2278,6 +2269,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
+ MAKE_CASE(AArch64ISD::CALL_BTI)
}
#undef MAKE_CASE
return nullptr;
@@ -2351,6 +2343,92 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.add(MI.getOperand(1)); // slice index register
+ MIB.add(MI.getOperand(2)); // slice index offset
+ MIB.add(MI.getOperand(3)); // pg
+ MIB.add(MI.getOperand(4)); // base
+ MIB.add(MI.getOperand(5)); // offset
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
+
+ MIB.addReg(AArch64::ZA, RegState::Define);
+ MIB.add(MI.getOperand(0)); // Vector select register
+ MIB.add(MI.getOperand(1)); // Vector select offset
+ MIB.add(MI.getOperand(2)); // Base
+ MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+ MIB.add(MI.getOperand(1)); // pn
+ MIB.add(MI.getOperand(2)); // pm
+ MIB.add(MI.getOperand(3)); // zn
+ MIB.add(MI.getOperand(4)); // zm
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+ MIB.add(MI.getOperand(1)); // Slice index register
+ MIB.add(MI.getOperand(2)); // Slice index offset
+ MIB.add(MI.getOperand(3)); // pg
+ MIB.add(MI.getOperand(4)); // zn
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
+ MIB.add(MI.getOperand(0)); // Mask
+
+ unsigned Mask = MI.getOperand(0).getImm();
+ for (unsigned I = 0; I < 8; I++) {
+ if (Mask & (1 << I))
+ MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
+ }
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -2366,9 +2444,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case TargetOpcode::STATEPOINT:
// STATEPOINT is a pseudo instruction which has no implicit defs/uses
// while bl call instruction (where statepoint will be lowered at the end)
- // has implicit def. Add this implicit dead def here as a workaround.
- MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true,
- true, false, true));
+ // has implicit def. This def is early-clobber as it will be set at
+ // the moment of the call and earlier than any use is read.
+ // Add this implicit dead def here as a workaround.
+ MI.addOperand(*MI.getMF(),
+ MachineOperand::CreateReg(
+ AArch64::LR, /*isDef*/ true,
+ /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
+ /*isUndef*/ false, /*isEarlyClobber*/ true));
LLVM_FALLTHROUGH;
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
@@ -2376,6 +2459,108 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_B:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_H:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_S:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_D:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_B:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_H:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_S:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_D:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
+ case AArch64::LDR_ZA_PSEUDO:
+ return EmitFill(MI, BB);
+ case AArch64::BFMOPA_MPPZZ_PSEUDO:
+ return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
+ case AArch64::BFMOPS_MPPZZ_PSEUDO:
+ return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
+ case AArch64::FMOPAL_MPPZZ_PSEUDO:
+ return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
+ case AArch64::FMOPSL_MPPZZ_PSEUDO:
+ return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
+ case AArch64::FMOPA_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::FMOPS_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::FMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::FMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SMOPA_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::SMOPS_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::UMOPA_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::UMOPS_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::USMOPA_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::USMOPS_MPPZZ_S_PSEUDO:
+ return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::SMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::UMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::UMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::USMOPA_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::USMOPS_MPPZZ_D_PSEUDO:
+ return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
+ BB);
+ case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
+ return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
+ BB);
+ case AArch64::ZERO_M_PSEUDO:
+ return EmitZero(MI, BB);
}
}
@@ -2596,7 +2781,17 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
bool IsSignaling) {
EVT VT = LHS.getValueType();
assert(VT != MVT::f128);
- assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
+
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
+
+ if (VT == MVT::f16 && !FullFP16) {
+ LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
+ {Chain, LHS});
+ RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
+ {LHS.getValue(1), RHS});
+ Chain = RHS.getValue(1);
+ VT = MVT::f32;
+ }
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
@@ -2605,8 +2800,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
@@ -2714,8 +2908,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
@@ -3282,40 +3475,68 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
+// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
+// sets 'C' bit to 0.
+static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
+ SDLoc DL(Value);
+ EVT VT = Value.getValueType();
+ SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
+ SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
+ SDValue Cmp =
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ return Cmp.getValue(1);
+}
- // Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
+// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
+static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
+ bool Invert) {
+ assert(Flag.getResNo() == 1);
+ SDLoc DL(Flag);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
+ SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
+}
+
+// Value is 1 if 'V' bit of NZCV is 1, else 0
+static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
+ assert(Flag.getResNo() == 1);
+ SDLoc DL(Flag);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
+}
+
+// This lowering is inefficient, but it will get cleaned up by
+// `foldOverflowCheck`
+static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
+ bool IsSigned) {
+ EVT VT0 = Op.getValue(0).getValueType();
+ EVT VT1 = Op.getValue(1).getValueType();
+
+ if (VT0 != MVT::i32 && VT0 != MVT::i64)
return SDValue();
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ bool InvertCarry = Opcode == AArch64ISD::SBCS;
+ SDValue OpLHS = Op.getOperand(0);
+ SDValue OpRHS = Op.getOperand(1);
+ SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default:
- llvm_unreachable("Invalid code");
- case ISD::ADDC:
- Opc = AArch64ISD::ADDS;
- break;
- case ISD::SUBC:
- Opc = AArch64ISD::SUBS;
- break;
- case ISD::ADDE:
- Opc = AArch64ISD::ADCS;
- ExtraOp = true;
- break;
- case ISD::SUBE:
- Opc = AArch64ISD::SBCS;
- ExtraOp = true;
- break;
- }
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(VT0, VT1);
+
+ SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
+ OpRHS, OpCarryIn);
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(2));
+ SDValue OutFlag =
+ IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
+ : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -3417,7 +3638,8 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
- EVT InVT = Op.getOperand(0).getValueType();
+ bool IsStrict = Op->isStrictFPOpcode();
+ EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
EVT VT = Op.getValueType();
if (VT.isScalableVector()) {
@@ -3437,6 +3659,12 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
!Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
+ if (IsStrict) {
+ SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
@@ -3446,6 +3674,13 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
SDLoc dl(Op);
+ if (IsStrict) {
+ InVT = InVT.changeVectorElementTypeToInteger();
+ SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+ return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
+ }
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
Op.getOperand(0));
@@ -3457,10 +3692,30 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
MVT ExtVT =
MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
+ if (IsStrict) {
+ SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
+ // Use a scalar operation for conversions between single-element vectors of
+ // the same size.
+ if (NumElts == 1) {
+ SDLoc dl(Op);
+ SDValue Extract = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
+ Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
+ EVT ScalarVT = VT.getScalarType();
+ if (IsStrict)
+ return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
+ {Op.getOperand(0), Extract});
+ return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
+ }
+
// Type changing conversions are illegal.
return Op;
}
@@ -3475,8 +3730,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
- assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
+ if (IsStrict) {
+ SDValue Ext =
+ DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), SrcVal});
+ return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
@@ -3507,7 +3768,7 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
"Saturation width cannot exceed result width");
// TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
- // Currently, the `llvm.fpto[su]i.sat.*` instrinsics don't accept scalable
+ // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
// types, so this is hard to reach.
if (DstVT.isScalableVector())
return SDValue();
@@ -3545,17 +3806,14 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SDValue Sat;
if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
SDValue MinC = DAG.getConstant(
- APInt::getSignedMaxValue(SatWidth).sextOrSelf(SrcElementWidth), DL,
- IntVT);
+ APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
SDValue MaxC = DAG.getConstant(
- APInt::getSignedMinValue(SatWidth).sextOrSelf(SrcElementWidth), DL,
- IntVT);
+ APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
} else {
SDValue MinC = DAG.getConstant(
- APInt::getAllOnesValue(SatWidth).zextOrSelf(SrcElementWidth), DL,
- IntVT);
+ APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
}
@@ -3604,14 +3862,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
SDValue Sat;
if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
SDValue MinC = DAG.getConstant(
- APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT);
+ APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
SDValue MaxC = DAG.getConstant(
- APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT);
+ APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
} else {
SDValue MinC = DAG.getConstant(
- APInt::getAllOnesValue(SatWidth).zextOrSelf(DstWidth), DL, DstVT);
+ APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
}
@@ -3623,9 +3881,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
+ bool IsStrict = Op->isStrictFPOpcode();
EVT VT = Op.getValueType();
SDLoc dl(Op);
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
EVT InVT = In.getValueType();
unsigned Opc = Op.getOpcode();
bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
@@ -3653,6 +3912,13 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
+ if (IsStrict) {
+ In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
+ {Op.getOperand(0), In});
+ return DAG.getNode(
+ ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
+ {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
+ }
In = DAG.getNode(Opc, dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
}
@@ -3661,9 +3927,24 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
+ if (IsStrict)
+ return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
return DAG.getNode(Opc, dl, VT, In);
}
+ // Use a scalar operation for conversions between single-element vectors of
+ // the same size.
+ if (VT.getVectorNumElements() == 1) {
+ SDValue Extract = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
+ In, DAG.getConstant(0, dl, MVT::i64));
+ EVT ScalarVT = VT.getScalarType();
+ if (IsStrict)
+ return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
+ {Op.getOperand(0), Extract});
+ return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
+ }
+
return Op;
}
@@ -3676,10 +3957,15 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
// f16 conversions are promoted to f32 when full fp16 is not supported.
- if (Op.getValueType() == MVT::f16 &&
- !Subtarget->hasFullFP16()) {
- assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
+ if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
SDLoc dl(Op);
+ if (IsStrict) {
+ SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), SrcVal});
+ return DAG.getNode(
+ ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
+ {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
+ }
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
@@ -3742,6 +4028,14 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
return LowerFixedLengthBitcastToSVE(Op, DAG);
if (OpVT.isScalableVector()) {
+ // Bitcasting between unpacked vector types of different element counts is
+ // not a NOP because the live elements are laid out differently.
+ // 01234567
+ // e.g. nxv2i32 = XX??XX??
+ // nxv4f16 = X?X?X?X?
+ if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
+ return SDValue();
+
if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
"Expected int->fp bitcast!");
@@ -3964,7 +4258,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -4059,10 +4353,26 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
case AArch64ISD::SETCC_MERGE_ZERO:
return Reinterpret;
case ISD::INTRINSIC_WO_CHAIN:
- if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue)
+ switch (InOp.getConstantOperandVal(0)) {
+ case Intrinsic::aarch64_sve_ptrue:
+ case Intrinsic::aarch64_sve_cmpeq_wide:
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ case Intrinsic::aarch64_sve_cmpge_wide:
+ case Intrinsic::aarch64_sve_cmpgt_wide:
+ case Intrinsic::aarch64_sve_cmplt_wide:
+ case Intrinsic::aarch64_sve_cmple_wide:
+ case Intrinsic::aarch64_sve_cmphs_wide:
+ case Intrinsic::aarch64_sve_cmphi_wide:
+ case Intrinsic::aarch64_sve_cmplo_wide:
+ case Intrinsic::aarch64_sve_cmpls_wide:
return Reinterpret;
+ }
}
+ // Splat vectors of one will generate ptrue instructions
+ if (ISD::isConstantSplatVectorAllOnes(InOp.getNode()))
+ return Reinterpret;
+
// Otherwise, zero the newly introduced lanes.
SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
SDValue MaskReinterpret =
@@ -4073,12 +4383,12 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = Op.getConstantOperandVal(1);
+ SDLoc DL(Op);
switch (IntNo) {
default:
return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::aarch64_mops_memset_tag: {
auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
- SDLoc DL(Op);
SDValue Chain = Node->getChain();
SDValue Dst = Op.getOperand(2);
SDValue Val = Op.getOperand(3);
@@ -4100,6 +4410,15 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
// changed.
return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
}
+ case Intrinsic::aarch64_sme_get_pstatesm: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue MRS = DAG.getNode(
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
+ Chain, DAG.getConstant(AArch64SysReg::SVCR, DL, MVT::i64));
+ SDValue Mask = DAG.getConstant(/* PSTATE.SM */ 1, DL, MVT::i64);
+ SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, MRS, Mask);
+ return DAG.getMergeValues({And, Chain}, DL);
+ }
}
}
@@ -4196,6 +4515,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_clz:
return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sme_cntsb:
+ return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
+ DAG.getConstant(1, dl, MVT::i32));
+ case Intrinsic::aarch64_sme_cntsh: {
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
+ return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
+ }
+ case Intrinsic::aarch64_sme_cntsw: {
+ SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
+ DAG.getConstant(1, dl, MVT::i32));
+ return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
+ DAG.getConstant(2, dl, MVT::i32));
+ }
+ case Intrinsic::aarch64_sme_cntsd: {
+ SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
+ DAG.getConstant(1, dl, MVT::i32));
+ return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
+ DAG.getConstant(3, dl, MVT::i32));
+ }
case Intrinsic::aarch64_sve_cnt: {
SDValue Data = Op.getOperand(3);
// CTPOP only supports integer operands.
@@ -4300,6 +4639,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_revw:
return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_revd:
+ return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_sxtb:
return DAG.getNode(
AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
@@ -4336,7 +4678,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3),
DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
Op.getOperand(1));
-
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
@@ -4382,9 +4723,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
IntNo == Intrinsic::aarch64_neon_shadd);
bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
IntNo == Intrinsic::aarch64_neon_urhadd);
- unsigned Opcode =
- IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
- : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
+ unsigned Opcode = IsSignedAdd
+ ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
+ : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
@@ -4395,8 +4736,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
+ case Intrinsic::aarch64_neon_saddlp:
case Intrinsic::aarch64_neon_uaddlp: {
- unsigned Opcode = AArch64ISD::UADDLP;
+ unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
+ ? AArch64ISD::UADDLP
+ : AArch64ISD::SADDLP;
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
}
case Intrinsic::aarch64_neon_sdot:
@@ -4428,19 +4772,26 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
return false;
}
-bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
- if (VT.getVectorElementType() == MVT::i32 &&
- VT.getVectorElementCount().getKnownMinValue() >= 4 &&
- !VT.isFixedLengthVector())
- return true;
+bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
+ EVT DataVT) const {
+ // SVE only supports implicit extension of 32-bit indices.
+ if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
+ return false;
- return false;
+ // Indices cannot be smaller than the main data type.
+ if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
+ return false;
+
+ // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
+ // element container type, which would violate the previous clause.
+ return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
}
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector() ||
- useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
- /*OverrideNEON=*/true);
+ useSVEForFixedLengthVectorVT(
+ ExtVal.getValueType(),
+ /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -4466,29 +4817,6 @@ unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
return AddrModes.find(Key)->second;
}
-unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
- std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
- {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
- AArch64ISD::SST1_PRED},
- {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
- AArch64ISD::SST1_UXTW_PRED},
- {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
- AArch64ISD::SST1_PRED},
- {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
- AArch64ISD::SST1_SXTW_PRED},
- {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
- AArch64ISD::SST1_SCALED_PRED},
- {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
- AArch64ISD::SST1_UXTW_SCALED_PRED},
- {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
- AArch64ISD::SST1_SCALED_PRED},
- {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
- AArch64ISD::SST1_SXTW_SCALED_PRED},
- };
- auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
- return AddrModes.find(Key)->second;
-}
-
unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
switch (Opcode) {
default:
@@ -4511,267 +4839,184 @@ unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
}
}
-bool getGatherScatterIndexIsExtended(SDValue Index) {
- unsigned Opcode = Index.getOpcode();
- if (Opcode == ISD::SIGN_EXTEND_INREG)
- return true;
-
- if (Opcode == ISD::AND) {
- SDValue Splat = Index.getOperand(1);
- if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
- return false;
- ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
- if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
- return false;
- return true;
- }
-
- return false;
-}
-
-// If the base pointer of a masked gather or scatter is null, we
-// may be able to swap BasePtr & Index and use the vector + register
-// or vector + immediate addressing mode, e.g.
-// VECTOR + REGISTER:
-// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
-// -> getelementptr %offset, <vscale x N x T> %indices
-// VECTOR + IMMEDIATE:
-// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
-// -> getelementptr #x, <vscale x N x T> %indices
-void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
- unsigned &Opcode, bool IsGather,
- SelectionDAG &DAG) {
- if (!isNullConstant(BasePtr))
- return;
-
- // FIXME: This will not match for fixed vector type codegen as the nodes in
- // question will have fixed<->scalable conversions around them. This should be
- // moved to a DAG combine or complex pattern so that is executes after all of
- // the fixed vector insert and extracts have been removed. This deficiency
- // will result in a sub-optimal addressing mode being used, i.e. an ADD not
- // being folded into the scatter/gather.
- ConstantSDNode *Offset = nullptr;
- if (Index.getOpcode() == ISD::ADD)
- if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
- if (isa<ConstantSDNode>(SplatVal))
- Offset = cast<ConstantSDNode>(SplatVal);
- else {
- BasePtr = SplatVal;
- Index = Index->getOperand(0);
- return;
- }
- }
-
- unsigned NewOp =
- IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
-
- if (!Offset) {
- std::swap(BasePtr, Index);
- Opcode = NewOp;
- return;
- }
-
- uint64_t OffsetVal = Offset->getZExtValue();
- unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
- auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
-
- if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
- // Index is out of range for the immediate addressing mode
- BasePtr = ConstOffset;
- Index = Index->getOperand(0);
- return;
- }
-
- // Immediate is in range
- Opcode = NewOp;
- BasePtr = Index->getOperand(0);
- Index = ConstOffset;
-}
-
SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc DL(Op);
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
- assert(MGT && "Can only custom lower gather load nodes");
-
- bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
- SDValue Index = MGT->getIndex();
+ SDLoc DL(Op);
SDValue Chain = MGT->getChain();
SDValue PassThru = MGT->getPassThru();
SDValue Mask = MGT->getMask();
SDValue BasePtr = MGT->getBasePtr();
- ISD::LoadExtType ExtTy = MGT->getExtensionType();
-
- ISD::MemIndexType IndexType = MGT->getIndexType();
- bool IsScaled =
- IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
- bool IsSigned =
- IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
- bool IdxNeedsExtend =
- getGatherScatterIndexIsExtended(Index) ||
- Index.getSimpleValueType().getVectorElementType() == MVT::i32;
- bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
-
- EVT VT = PassThru.getSimpleValueType();
- EVT IndexVT = Index.getSimpleValueType();
+ SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
+ EVT VT = Op.getValueType();
EVT MemVT = MGT->getMemoryVT();
- SDValue InputVT = DAG.getValueType(MemVT);
-
- if (VT.getVectorElementType() == MVT::bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
+ ISD::LoadExtType ExtType = MGT->getExtensionType();
+ ISD::MemIndexType IndexType = MGT->getIndexType();
- if (IsFixedLength) {
+ // SVE supports zero (and so undef) passthrough values only, everything else
+ // must be handled manually by an explicit select on the load's output.
+ if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
+ SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
+ SDValue Load =
+ DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
+ MGT->getMemOperand(), IndexType, ExtType);
+ SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
+ return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
+ }
+
+ bool IsScaled = MGT->isIndexScaled();
+ bool IsSigned = MGT->isIndexSigned();
+
+ // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
+ // must be calculated beforehand.
+ uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+ if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
+ assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
+ EVT IndexVT = Index.getValueType();
+ Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
+ DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
+ Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
+
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
+ MGT->getMemOperand(), IndexType, ExtType);
+ }
+
+ // Lower fixed length gather to a scalable equivalent.
+ if (VT.isFixedLengthVector()) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
- "Cannot lower when not using SVE for fixed vectors");
- if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
- IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
- MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
- } else {
- MemVT = getContainerForFixedLengthVector(DAG, MemVT);
- IndexVT = MemVT.changeTypeToInteger();
- }
- InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
- Mask = DAG.getNode(
- ISD::SIGN_EXTEND, DL,
- VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
- }
-
- if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
- PassThru = SDValue();
-
- if (VT.isFloatingPoint() && !IsFixedLength) {
- // Handle FP data by using an integer gather and casting the result.
- if (PassThru) {
- EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
- PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
- }
- InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
- }
-
- SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
-
- if (getGatherScatterIndexIsExtended(Index))
- Index = Index.getOperand(0);
-
- unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
- selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
- /*isGather=*/true, DAG);
-
- if (ResNeedsSignExtend)
- Opcode = getSignExtendedGatherOpcode(Opcode);
-
- if (IsFixedLength) {
- if (Index.getSimpleValueType().isFixedLengthVector())
- Index = convertToScalableVector(DAG, IndexVT, Index);
- if (BasePtr.getSimpleValueType().isFixedLengthVector())
- BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
+ "Cannot lower when not using SVE for fixed vectors!");
+
+ // NOTE: Handle floating-point as if integer then bitcast the result.
+ EVT DataVT = VT.changeVectorElementTypeToInteger();
+ MemVT = MemVT.changeVectorElementTypeToInteger();
+
+ // Find the smallest integer fixed length vector we can use for the gather.
+ EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
+ if (DataVT.getVectorElementType() == MVT::i64 ||
+ Index.getValueType().getVectorElementType() == MVT::i64 ||
+ Mask.getValueType().getVectorElementType() == MVT::i64)
+ PromotedVT = VT.changeVectorElementType(MVT::i64);
+
+ // Promote vector operands except for passthrough, which we know is either
+ // undef or zero, and thus best constructed directly.
+ unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
+
+ // A promoted result type forces the need for an extending load.
+ if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
+ ExtType = ISD::EXTLOAD;
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
+
+ // Convert fixed length vector operands to scalable.
+ MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
+ Index = convertToScalableVector(DAG, ContainerVT, Index);
Mask = convertFixedMaskToScalableVector(Mask, DAG);
- }
-
- SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
- SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
- Chain = Result.getValue(1);
-
- if (IsFixedLength) {
- Result = convertFromScalableVector(
- DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
- Result);
- Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
- Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
-
- if (PassThru)
- Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
- } else {
- if (PassThru)
- Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
-
+ PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
+ : DAG.getConstant(0, DL, ContainerVT);
+
+ // Emit equivalent scalable vector gather.
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ SDValue Load =
+ DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
+ Ops, MGT->getMemOperand(), IndexType, ExtType);
+
+ // Extract fixed length data then convert to the required result type.
+ SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
+ Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
if (VT.isFloatingPoint())
- Result = getSVESafeBitCast(VT, Result, DAG);
+ Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+
+ return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
}
- return DAG.getMergeValues({Result, Chain}, DL);
+ // Everything else is legal.
+ return Op;
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc DL(Op);
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
- assert(MSC && "Can only custom lower scatter store nodes");
- bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
-
- SDValue Index = MSC->getIndex();
+ SDLoc DL(Op);
SDValue Chain = MSC->getChain();
SDValue StoreVal = MSC->getValue();
SDValue Mask = MSC->getMask();
SDValue BasePtr = MSC->getBasePtr();
-
- ISD::MemIndexType IndexType = MSC->getIndexType();
- bool IsScaled =
- IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
- bool IsSigned =
- IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
- bool NeedsExtend =
- getGatherScatterIndexIsExtended(Index) ||
- Index.getSimpleValueType().getVectorElementType() == MVT::i32;
-
- EVT VT = StoreVal.getSimpleValueType();
- EVT IndexVT = Index.getSimpleValueType();
- SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Index = MSC->getIndex();
+ SDValue Scale = MSC->getScale();
+ EVT VT = StoreVal.getValueType();
EVT MemVT = MSC->getMemoryVT();
- SDValue InputVT = DAG.getValueType(MemVT);
+ ISD::MemIndexType IndexType = MSC->getIndexType();
+ bool Truncating = MSC->isTruncatingStore();
- if (VT.getVectorElementType() == MVT::bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
+ bool IsScaled = MSC->isIndexScaled();
+ bool IsSigned = MSC->isIndexSigned();
+
+ // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
+ // must be calculated before hand.
+ uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+ if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
+ assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
+ EVT IndexVT = Index.getValueType();
+ Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
+ DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
+ Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
- if (IsFixedLength) {
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
+ MSC->getMemOperand(), IndexType, Truncating);
+ }
+
+ // Lower fixed length scatter to a scalable equivalent.
+ if (VT.isFixedLengthVector()) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
- "Cannot lower when not using SVE for fixed vectors");
- if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
- IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
- MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
- } else {
- MemVT = getContainerForFixedLengthVector(DAG, MemVT);
- IndexVT = MemVT.changeTypeToInteger();
+ "Cannot lower when not using SVE for fixed vectors!");
+
+ // Once bitcast we treat floating-point scatters as if integer.
+ if (VT.isFloatingPoint()) {
+ VT = VT.changeVectorElementTypeToInteger();
+ MemVT = MemVT.changeVectorElementTypeToInteger();
+ StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
}
- InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
-
- StoreVal =
- DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
- StoreVal = DAG.getNode(
- ISD::ANY_EXTEND, DL,
- VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
- StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
- Mask = DAG.getNode(
- ISD::SIGN_EXTEND, DL,
- VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
- } else if (VT.isFloatingPoint()) {
- // Handle FP data by casting the data so an integer scatter can be used.
- EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
- StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
- InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
- }
-
- if (getGatherScatterIndexIsExtended(Index))
- Index = Index.getOperand(0);
-
- unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
- selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
- /*isGather=*/false, DAG);
-
- if (IsFixedLength) {
- if (Index.getSimpleValueType().isFixedLengthVector())
- Index = convertToScalableVector(DAG, IndexVT, Index);
- if (BasePtr.getSimpleValueType().isFixedLengthVector())
- BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
+
+ // Find the smallest integer fixed length vector we can use for the scatter.
+ EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
+ if (VT.getVectorElementType() == MVT::i64 ||
+ Index.getValueType().getVectorElementType() == MVT::i64 ||
+ Mask.getValueType().getVectorElementType() == MVT::i64)
+ PromotedVT = VT.changeVectorElementType(MVT::i64);
+
+ // Promote vector operands.
+ unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
+ StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
+
+ // A promoted value type forces the need for a truncating store.
+ if (PromotedVT != VT)
+ Truncating = true;
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
+
+ // Convert fixed length vector operands to scalable.
+ MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
+ Index = convertToScalableVector(DAG, ContainerVT, Index);
Mask = convertFixedMaskToScalableVector(Mask, DAG);
+ StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
+
+ // Emit equivalent scalable vector scatter.
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
+ MSC->getMemOperand(), IndexType, Truncating);
}
- SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
- return DAG.getNode(Opcode, DL, VTs, Ops);
+ // Everything else is legal.
+ return Op;
}
SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -4780,7 +5025,9 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(LoadNode && "Expected custom lowering of a masked load node");
EVT VT = Op->getValueType(0);
- if (useSVEForFixedLengthVectorVT(VT, true))
+ if (useSVEForFixedLengthVectorVT(
+ VT,
+ /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
SDValue PassThru = LoadNode->getPassThru();
@@ -4847,7 +5094,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
- if (useSVEForFixedLengthVectorVT(VT, true))
+ if (useSVEForFixedLengthVectorVT(
+ VT,
+ /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
return LowerFixedLengthVectorStoreToSVE(Op, DAG);
unsigned AS = StoreNode->getAddressSpace();
@@ -5007,6 +5256,22 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
Cmp.getValue(1));
}
+static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+
+ AArch64CC::CondCode CC;
+ if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
+ SDLoc dl(Op);
+ SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
+ return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -5026,6 +5291,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
return LowerSETCC(Op, DAG);
+ case ISD::BRCOND:
+ return LowerBRCOND(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
case ISD::SELECT:
@@ -5046,11 +5313,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerVACOPY(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE:
- return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADDCARRY:
+ return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
+ case ISD::SUBCARRY:
+ return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
+ case ISD::SADDO_CARRY:
+ return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
+ case ISD::SSUBO_CARRY:
+ return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
@@ -5165,11 +5435,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::MULHS:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
case ISD::MULHU:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
case ISD::INTRINSIC_W_CHAIN:
return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
@@ -5234,11 +5502,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFixedLengthVectorLoadToSVE(Op, DAG);
return LowerLOAD(Op, DAG);
case ISD::ADD:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
case ISD::AND:
- return LowerToScalableOp(Op, DAG);
case ISD::SUB:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
+ return LowerToScalableOp(Op, DAG);
case ISD::FMAXIMUM:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
case ISD::FMAXNUM:
@@ -5260,12 +5526,23 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::BSWAP:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
case ISD::CTLZ:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
case ISD::CTTZ:
return LowerCTTZ(Op, DAG);
case ISD::VECTOR_SPLICE:
return LowerVECTOR_SPLICE(Op, DAG);
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT: {
+ assert(Op.getOperand(1).getValueType() == MVT::f16 &&
+ "Expected custom lowering of rounding operations only for f16");
+ SDLoc DL(Op);
+ SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1)});
+ return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
+ {Ext.getValue(1), Ext.getValue(0)});
+ }
}
}
@@ -5275,10 +5552,7 @@ bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
EVT VT, bool OverrideNEON) const {
- if (!Subtarget->useSVEForFixedLengthVectors())
- return false;
-
- if (!VT.isFixedLengthVector())
+ if (!VT.isFixedLengthVector() || !VT.isSimple())
return false;
// Don't use SVE for vectors we cannot scalarize if required.
@@ -5300,12 +5574,16 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
// All SVE implementations support NEON sized vectors.
if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
- return true;
+ return Subtarget->hasSVE();
// Ensure NEON MVTs only belong to a single register class.
if (VT.getFixedSizeInBits() <= 128)
return false;
+ // Ensure wider than NEON code generation is enabled.
+ if (!Subtarget->useSVEForFixedLengthVectors())
+ return false;
+
// Don't use SVE for types that don't fit.
if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
return false;
@@ -5322,6 +5600,36 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
+static unsigned getIntrinsicID(const SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ default:
+ return Intrinsic::not_intrinsic;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ if (IID < Intrinsic::num_intrinsics)
+ return IID;
+ return Intrinsic::not_intrinsic;
+ }
+ }
+}
+
+bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const {
+ if (!N0.hasOneUse())
+ return false;
+
+ unsigned IID = getIntrinsicID(N1.getNode());
+ // Avoid reassociating expressions that can be lowered to smlal/umlal.
+ if (IID == Intrinsic::aarch64_neon_umull ||
+ N1.getOpcode() == AArch64ISD::UMULL ||
+ IID == Intrinsic::aarch64_neon_smull ||
+ N1.getOpcode() == AArch64ISD::SMULL)
+ return N0.getOpcode() != ISD::ADD;
+
+ return true;
+}
+
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
@@ -5368,8 +5676,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
+ const Function &F = MF.getFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
+ bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
+ DAG.getTargetLoweringInfo(), MF.getDataLayout());
+ if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
+ FuncInfo->setIsSVECC(true);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -5383,7 +5699,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
- Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
+ Function::const_arg_iterator CurOrigArg = F.arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
@@ -5454,11 +5770,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else if (RegVT.isScalableVector() &&
- RegVT.getVectorElementType() == MVT::i1)
+ RegVT.getVectorElementType() == MVT::i1) {
+ FuncInfo->setIsSVECC(true);
RC = &AArch64::PPRRegClass;
- else if (RegVT.isScalableVector())
+ } else if (RegVT.isScalableVector()) {
+ FuncInfo->setIsSVECC(true);
RC = &AArch64::ZPRRegClass;
- else
+ } else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
@@ -5580,7 +5898,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// i1 arguments are zero-extended to i8 by the caller. Emit a
// hint to reflect this.
if (Ins[i].isOrigArg()) {
- Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex());
+ Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
if (OrigArg->getType()->isIntegerTy(1)) {
if (!Ins[i].Flags.isZExt()) {
ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
@@ -5595,7 +5913,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
// varargs
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
@@ -5843,14 +6160,62 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
}
}
+static void analyzeCallOperands(const AArch64TargetLowering &TLI,
+ const AArch64Subtarget *Subtarget,
+ const TargetLowering::CallLoweringInfo &CLI,
+ CCState &CCInfo) {
+ const SelectionDAG &DAG = CLI.DAG;
+ CallingConv::ID CalleeCC = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+
+ unsigned NumArgs = Outs.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+
+ bool UseVarArgCC = false;
+ if (IsVarArg) {
+ // On Windows, the fixed arguments in a vararg call are passed in GPRs
+ // too, so use the vararg CC to force them to integer registers.
+ if (IsCalleeWin64) {
+ UseVarArgCC = true;
+ } else {
+ UseVarArgCC = !Outs[i].IsFixed;
+ }
+ } else {
+ // Get type of the original argument.
+ EVT ActualVT =
+ TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ ArgVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ ArgVT = MVT::i16;
+ }
+
+ CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
+ bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+}
+
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
- SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ const CallLoweringInfo &CLI) const {
+ CallingConv::ID CalleeCC = CLI.CallConv;
if (!mayTailCallThisCC(CalleeCC))
return false;
+ SDValue Callee = CLI.Callee;
+ bool IsVarArg = CLI.IsVarArg;
+ const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ const SelectionDAG &DAG = CLI.DAG;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -5860,7 +6225,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// The check for matching callee-saved regs will determine whether it is
// eligible for TCO.
if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
- AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
+ MF.getInfo<AArch64FunctionInfo>()->isSVECC())
CallerCC = CallingConv::AArch64_SVE_VectorCall;
bool CCMatch = CallerCC == CalleeCC;
@@ -5915,30 +6280,14 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
- assert((!isVarArg || CalleeCC == CallingConv::C) &&
+ assert((!IsVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
LLVMContext &C = *DAG.getContext();
- if (isVarArg && !Outs.empty()) {
- // At least two cases here: if caller is fastcc then we can't have any
- // memory arguments (we'd be expected to clean up the stack afterwards). If
- // caller is C then we could potentially use its argument area.
-
- // FIXME: for now we take the most conservative of these in both cases:
- // disallow all variadic memory operands.
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
-
- CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
- for (const CCValAssign &ArgLoc : ArgLocs)
- if (!ArgLoc.isRegLoc())
- return false;
- }
-
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
- CCAssignFnForCall(CalleeCC, isVarArg),
- CCAssignFnForCall(CallerCC, isVarArg)))
+ CCAssignFnForCall(CalleeCC, IsVarArg),
+ CCAssignFnForCall(CallerCC, IsVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -5958,9 +6307,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+ CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
+
+ analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
- CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+ if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
+ // When we are musttail, additional checks have been done and we can safely ignore this check
+ // At least two cases here: if caller is fastcc then we can't have any
+ // memory arguments (we'd be expected to clean up the stack afterwards). If
+ // caller is C then we could potentially use its argument area.
+
+ // FIXME: for now we take the most conservative of these in both cases:
+ // disallow all variadic memory operands.
+ for (const CCValAssign &ArgLoc : ArgLocs)
+ if (!ArgLoc.isRegLoc())
+ return false;
+ }
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -6051,7 +6413,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
- CallingConv::ID CallConv = CLI.CallConv;
+ CallingConv::ID &CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
@@ -6061,7 +6423,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
- bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
+ bool GuardWithBTI = false;
+
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
+ !Subtarget->noBTIAtReturnTwice()) {
+ GuardWithBTI = FuncInfo->branchTargetEnforcement();
+ }
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
@@ -6079,8 +6446,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
// Check if it's really possible to do a tail call.
- IsTailCall = isEligibleForTailCallOptimization(
- Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+ IsTailCall = isEligibleForTailCallOptimization(CLI);
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
@@ -6101,56 +6467,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (IsVarArg) {
- // Handle fixed and variable vector arguments differently.
- // Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
- MVT ArgVT = Outs[i].VT;
- if (!Outs[i].IsFixed && ArgVT.isScalableVector())
+ if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
report_fatal_error("Passing SVE types to variadic functions is "
"currently not supported");
-
- ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- bool UseVarArgCC = !Outs[i].IsFixed;
- // On Windows, the fixed arguments in a vararg call are passed in GPRs
- // too, so use the vararg CC to force them to integer registers.
- if (IsCalleeWin64)
- UseVarArgCC = true;
- CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
- bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
- assert(!Res && "Call operand has unhandled type");
- (void)Res;
- }
- } else {
- // At this point, Outs[].VT may already be promoted to i32. To correctly
- // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
- // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
- // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
- // we use a special version of AnalyzeCallOperands to pass in ValVT and
- // LocVT.
- unsigned NumArgs = Outs.size();
- for (unsigned i = 0; i != NumArgs; ++i) {
- MVT ValVT = Outs[i].VT;
- // Get type of the original argument.
- EVT ActualVT = getValueType(DAG.getDataLayout(),
- CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
- /*AllowUnknown*/ true);
- MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
- ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
- if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
- ValVT = MVT::i8;
- else if (ActualMVT == MVT::i16)
- ValVT = MVT::i16;
-
- CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
- bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
- assert(!Res && "Call operand has unhandled type");
- (void)Res;
}
}
+ analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
+
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -6536,7 +6863,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
Ops.insert(Ops.begin() + 1, GA);
- }
+ } else if (GuardWithBTI)
+ CallOpc = AArch64ISD::CALL_BTI;
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
@@ -7313,103 +7641,88 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SelectionDAG &DAG) const {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
EVT VT = Op.getValueType();
+ EVT IntVT = VT.changeTypeToInteger();
SDLoc DL(Op);
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
- if (VT.isScalableVector()) {
- if (VT != SrcVT)
- return SDValue();
+ if (SrcVT.bitsLT(VT))
+ In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+ else if (SrcVT.bitsGT(VT))
+ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
- // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK)
- //
- // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU;
- // maybe useful for copysign operations with mismatched VTs.
- //
- // IntVT here is chosen so it's a legal type with the same element width
- // as the input.
- EVT IntVT =
+ if (VT.isScalableVector())
+ IntVT =
getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
- unsigned NumBits = VT.getScalarSizeInBits();
- SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT);
- SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT);
- SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask,
- getSVESafeBitCast(IntVT, In2, DAG));
- SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask,
- getSVESafeBitCast(IntVT, In1, DAG));
- SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude);
- return getSVESafeBitCast(VT, IntResult, DAG);
- }
- if (!Subtarget->hasNEON())
+ if (VT != In2.getValueType())
return SDValue();
- if (SrcVT.bitsLT(VT))
- In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
- else if (SrcVT.bitsGT(VT))
- In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
+ auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
+ if (VT.isScalableVector())
+ return getSVESafeBitCast(VT, Op, DAG);
- EVT VecVT;
- uint64_t EltMask;
- SDValue VecVal1, VecVal2;
+ return DAG.getBitcast(VT, Op);
+ };
- auto setVecVal = [&] (int Idx) {
+ SDValue VecVal1, VecVal2;
+ EVT VecVT;
+ auto SetVecVal = [&](int Idx = -1) {
if (!VT.isVector()) {
- VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
- DAG.getUNDEF(VecVT), In1);
- VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
- DAG.getUNDEF(VecVT), In2);
+ VecVal1 =
+ DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
+ VecVal2 =
+ DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
} else {
- VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
- VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+ VecVal1 = BitCast(VecVT, In1, DAG);
+ VecVal2 = BitCast(VecVT, In2, DAG);
}
};
-
- if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
- VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
- EltMask = 0x80000000ULL;
- setVecVal(AArch64::ssub);
- } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+ if (VT.isVector()) {
+ VecVT = IntVT;
+ SetVecVal();
+ } else if (VT == MVT::f64) {
VecVT = MVT::v2i64;
-
- // We want to materialize a mask with the high bit set, but the AdvSIMD
- // immediate moves cannot materialize that in a single instruction for
- // 64-bit elements. Instead, materialize zero and then negate it.
- EltMask = 0;
-
- setVecVal(AArch64::dsub);
- } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
- VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
- EltMask = 0x8000ULL;
- setVecVal(AArch64::hsub);
+ SetVecVal(AArch64::dsub);
+ } else if (VT == MVT::f32) {
+ VecVT = MVT::v4i32;
+ SetVecVal(AArch64::ssub);
+ } else if (VT == MVT::f16) {
+ VecVT = MVT::v8i16;
+ SetVecVal(AArch64::hsub);
} else {
llvm_unreachable("Invalid type for copysign!");
}
- SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
+ unsigned BitWidth = In1.getScalarValueSizeInBits();
+ SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
- // If we couldn't materialize the mask above, then the mask vector will be
- // the zero vector, and we need to negate it here.
+ // We want to materialize a mask with every bit but the high bit set, but the
+ // AdvSIMD immediate moves cannot materialize that in a single instruction for
+ // 64-bit elements. Instead, materialize all bits set and then negate that.
if (VT == MVT::f64 || VT == MVT::v2f64) {
- BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
- BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
- BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
+ SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
+ SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
+ SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
+ SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
}
- SDValue Sel =
- DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
-
+ SDValue BSP =
+ DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
if (VT == MVT::f16)
- return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
+ return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
if (VT == MVT::f32)
- return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
- else if (VT == MVT::f64)
- return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
- else
- return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+ return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
+ if (VT == MVT::f64)
+ return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
+
+ return BitCast(VT, BSP, DAG);
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
@@ -7485,7 +7798,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
assert(VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
+ useSVEForFixedLengthVectorVT(
+ VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
SDLoc DL(Op);
SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
@@ -7517,22 +7831,19 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
}
if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
+ useSVEForFixedLengthVectorVT(
+ VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
switch (Opcode) {
default:
llvm_unreachable("Wrong instruction");
case ISD::SMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
case ISD::SMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
case ISD::UMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
case ISD::UMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
- /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
}
}
@@ -7547,9 +7858,9 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
EVT VT = Op.getValueType();
if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
- true);
+ useSVEForFixedLengthVectorVT(
+ VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
SDLoc DL(Op);
SDValue REVB;
@@ -8990,12 +9301,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(V.getOperand(1))) {
+ !isa<ConstantSDNode>(V.getOperand(1)) ||
+ V.getOperand(0).getValueType().isScalableVector()) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: "
"a shuffle can only come from building a vector from "
- "various elements of other vectors, provided their "
- "indices are constant\n");
+ "various elements of other fixed-width vectors, provided "
+ "their indices are constant\n");
return SDValue();
}
@@ -9011,10 +9323,72 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
+ // If we have 3 or 4 sources, try to generate a TBL, which will at least be
+ // better than moving to/from gpr registers for larger vectors.
+ if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
+ // Construct a mask for the tbl. We may need to adjust the index for types
+ // larger than i8.
+ SmallVector<unsigned, 16> Mask;
+ unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
+ for (unsigned I = 0; I < NumElts; ++I) {
+ SDValue V = Op.getOperand(I);
+ if (V.isUndef()) {
+ for (unsigned OF = 0; OF < OutputFactor; OF++)
+ Mask.push_back(-1);
+ continue;
+ }
+ // Set the Mask lanes adjusted for the size of the input and output
+ // lanes. The Mask is always i8, so it will set OutputFactor lanes per
+ // output element, adjusted in their positions per input and output types.
+ unsigned Lane = V.getConstantOperandVal(1);
+ for (unsigned S = 0; S < Sources.size(); S++) {
+ if (V.getOperand(0) == Sources[S].Vec) {
+ unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
+ unsigned InputBase = 16 * S + Lane * InputSize / 8;
+ for (unsigned OF = 0; OF < OutputFactor; OF++)
+ Mask.push_back(InputBase + OF);
+ break;
+ }
+ }
+ }
+
+ // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
+ // v16i8, and the TBLMask
+ SmallVector<SDValue, 16> TBLOperands;
+ TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
+ ? Intrinsic::aarch64_neon_tbl3
+ : Intrinsic::aarch64_neon_tbl4,
+ dl, MVT::i32));
+ for (unsigned i = 0; i < Sources.size(); i++) {
+ SDValue Src = Sources[i].Vec;
+ EVT SrcVT = Src.getValueType();
+ Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
+ assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
+ "Expected a legally typed vector");
+ if (SrcVT.is64BitVector())
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
+ DAG.getUNDEF(MVT::v8i8));
+ TBLOperands.push_back(Src);
+ }
+
+ SmallVector<SDValue, 16> TBLMask;
+ for (unsigned i = 0; i < Mask.size(); i++)
+ TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
+ assert((Mask.size() == 8 || Mask.size() == 16) &&
+ "Expected a v8i8 or v16i8 Mask");
+ TBLOperands.push_back(
+ DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
+
+ SDValue Shuffle =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+ Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
+ return DAG.getBitcast(VT, Shuffle);
+ }
+
if (Sources.size() > 2) {
- LLVM_DEBUG(
- dbgs() << "Reshuffle failed: currently only do something sane when at "
- "most two source vectors are involved\n");
+ LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
+ << "sensible when at most two source vectors are "
+ << "involved\n");
return SDValue();
}
@@ -9039,8 +9413,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
- uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
- if (SrcVTSize == VTSize)
+ TypeSize SrcVTSize = SrcVT.getSizeInBits();
+ if (SrcVTSize == TypeSize::Fixed(VTSize))
continue;
// This stage of the search produces a source with the same element type as
@@ -9049,7 +9423,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
- if (SrcVTSize < VTSize) {
+ if (SrcVTSize.getFixedValue() < VTSize) {
assert(2 * SrcVTSize == VTSize);
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
@@ -9059,7 +9433,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
continue;
}
- if (SrcVTSize != 2 * VTSize) {
+ if (SrcVTSize.getFixedValue() != 2 * VTSize) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: result vector too small to extract\n");
return SDValue();
@@ -9205,6 +9579,56 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
return true;
}
+// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+// v4i32s. This is really a truncate, which we can construct out of (legal)
+// concats and truncate nodes.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+ if (V.getValueType() != MVT::v16i8)
+ return SDValue();
+ assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
+
+ for (unsigned X = 0; X < 4; X++) {
+ // Check the first item in each group is an extract from lane 0 of a v4i32
+ // or v4i16.
+ SDValue BaseExt = V.getOperand(X * 4);
+ if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
+ BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
+ !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
+ BaseExt.getConstantOperandVal(1) != 0)
+ return SDValue();
+ SDValue Base = BaseExt.getOperand(0);
+ // And check the other items are extracts from the same vector.
+ for (unsigned Y = 1; Y < 4; Y++) {
+ SDValue Ext = V.getOperand(X * 4 + Y);
+ if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Ext.getOperand(0) != Base ||
+ !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+ Ext.getConstantOperandVal(1) != Y)
+ return SDValue();
+ }
+ }
+
+ // Turn the buildvector into a series of truncates and concatenates, which
+ // will become uzp1's. Any v4i32s we found get truncated to v4i16, which are
+ // concatenated together to produce 2 v8i16. These are both truncated and
+ // concatenated together.
+ SDLoc DL(V);
+ SDValue Trunc[4] = {
+ V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
+ V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
+ for (int I = 0; I < 4; I++)
+ if (Trunc[I].getValueType() == MVT::v4i32)
+ Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]);
+ SDValue Concat0 =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
+ SDValue Concat1 =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+}
+
/// Check if a vector shuffle corresponds to a DUP instructions with a larger
/// element width than the vector lane type. If that is the case the function
/// returns true and writes the value of the DUP instruction lane operand into
@@ -9534,8 +9958,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
-/// the specified operations to build the shuffle.
-static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+/// the specified operations to build the shuffle. ID is the perfect-shuffle
+/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect
+/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
+/// the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
+ SDValue V2, unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
@@ -9552,12 +9980,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR // VTRN, right result
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR, // VTRN, right result
+ OP_MOVLANE // Move lane. RHSID is the lane to move into
};
if (OpNum == OP_COPY) {
@@ -9567,9 +9996,71 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return RHS;
}
+ if (OpNum == OP_MOVLANE) {
+ // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+ auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+ assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+ Elt = 3 - Elt;
+ while (Elt > 0) {
+ ID /= 9;
+ Elt--;
+ }
+ return (ID % 9 == 8) ? -1 : ID % 9;
+ };
+
+ // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
+ // lane to move from is taken from the PFID, which always refers to the
+ // original vectors (V1 or V2).
+ SDValue OpLHS = GeneratePerfectShuffle(
+ LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ EVT VT = OpLHS.getValueType();
+ assert(RHSID < 8 && "Expected a lane index for RHSID!");
+ unsigned ExtLane = 0;
+ SDValue Input;
+
+ // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+ // convert into a higher type.
+ if (RHSID & 0x4) {
+ int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+ if (MaskElt == -1)
+ MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+ Input = MaskElt < 2 ? V1 : V2;
+ if (VT.getScalarSizeInBits() == 16) {
+ Input = DAG.getBitcast(MVT::v2f32, Input);
+ OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
+ } else {
+ assert(VT.getScalarSizeInBits() == 32 &&
+ "Expected 16 or 32 bit shuffle elemements");
+ Input = DAG.getBitcast(MVT::v2f64, Input);
+ OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
+ }
+ } else {
+ int MaskElt = getPFIDLane(ID, RHSID);
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+ Input = MaskElt < 4 ? V1 : V2;
+ // Be careful about creating illegal types. Use f16 instead of i16.
+ if (VT == MVT::v4i16) {
+ Input = DAG.getBitcast(MVT::v4f16, Input);
+ OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+ }
+ }
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ Input.getValueType().getVectorElementType(),
+ Input, DAG.getVectorIdxConstant(ExtLane, dl));
+ SDValue Ins =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
+ Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
+ return DAG.getBitcast(VT, Ins);
+ }
+
SDValue OpLHS, OpRHS;
- OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
- OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+ OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
+ RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
+ RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
switch (OpNum) {
@@ -9648,14 +10139,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
EVT EltVT = Op.getValueType().getVectorElementType();
unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
- SmallVector<SDValue, 8> TBLMask;
- for (int Val : ShuffleMask) {
- for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
- unsigned Offset = Byte + Val * BytesPerElt;
- TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
- }
+ bool Swap = false;
+ if (V1.isUndef() || isZerosVector(V1.getNode())) {
+ std::swap(V1, V2);
+ Swap = true;
}
+ // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
+ // out of range values with 0s. We do need to make sure that any out-of-range
+ // values are really out-of-range for a v16i8 vector.
+ bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
MVT IndexVT = MVT::v8i8;
unsigned IndexLen = 8;
if (Op.getValueSizeInBits() == 128) {
@@ -9663,11 +10156,23 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
IndexLen = 16;
}
+ SmallVector<SDValue, 8> TBLMask;
+ for (int Val : ShuffleMask) {
+ for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+ unsigned Offset = Byte + Val * BytesPerElt;
+ if (Swap)
+ Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
+ if (IsUndefOrZero && Offset >= IndexLen)
+ Offset = 255;
+ TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
+ }
+ }
+
SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
- if (V2.getNode()->isUndef()) {
+ if (IsUndefOrZero) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
@@ -9732,6 +10237,10 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
if (ExtIdxInBits % CastedEltBitWidth != 0)
return false;
+ // Can't handle cases where vector size is not 128-bit
+ if (!Extract.getOperand(0).getValueType().is128BitVector())
+ return false;
+
// Update the lane value by offsetting with the scaled extract index.
LaneC += ExtIdxInBits / CastedEltBitWidth;
@@ -10014,10 +10523,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
- unsigned Cost = (PFEntry >> 30);
-
- if (Cost <= 4)
- return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
+ dl);
}
return GenerateTBL(Op, ShuffleMask, DAG);
@@ -10025,56 +10532,33 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
EVT VT = Op.getValueType();
- EVT ElemVT = VT.getScalarType();
- SDValue SplatVal = Op.getOperand(0);
if (useSVEForFixedLengthVectorVT(VT))
return LowerToScalableOp(Op, DAG);
- // Extend input splat value where needed to fit into a GPR (32b or 64b only)
- // FPRs don't have this restriction.
- switch (ElemVT.getSimpleVT().SimpleTy) {
- case MVT::i1: {
- // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
- // lowering code.
- if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
- // We can hande the zero case during isel.
- if (ConstVal->isZero())
- return Op;
- if (ConstVal->isOne())
- return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
- }
- // The general case of i1. There isn't any natural way to do this,
- // so we use some trickery with whilelo.
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
- DAG.getValueType(MVT::i1));
- SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
- MVT::i64);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
- DAG.getConstant(0, dl, MVT::i64), SplatVal);
- }
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
- break;
- case MVT::i64:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- break;
- case MVT::f16:
- case MVT::bf16:
- case MVT::f32:
- case MVT::f64:
- // Fine as is
- break;
- default:
- report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
- }
+ assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
+ "Unexpected vector type!");
+
+ // We can handle the constant cases during isel.
+ if (isa<ConstantSDNode>(Op.getOperand(0)))
+ return Op;
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+ // There isn't a natural way to handle the general i1 case, so we use some
+ // trickery with whilelo.
+ SDLoc DL(Op);
+ SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
+ SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
+ DAG.getValueType(MVT::i1));
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ if (VT == MVT::nxv1i1)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
+ Zero, SplatVal),
+ Zero);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
}
SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
@@ -10090,18 +10574,17 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
return SDValue();
// The DUPQ operation is independent of element type so normalise to i64s.
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
SDValue Idx128 = Op.getOperand(2);
// DUPQ can be used when idx is in range.
auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
if (CIdx && (CIdx->getZExtValue() <= 3)) {
SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
- SDNode *DUPQ =
- DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
- return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+ return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
}
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+
// The ACLE says this must produce the same result as:
// svtbl(data, svadd_x(svptrue_b64(),
// svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
@@ -10358,20 +10841,6 @@ static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
return true;
}
-static unsigned getIntrinsicID(const SDNode *N) {
- unsigned Opcode = N->getOpcode();
- switch (Opcode) {
- default:
- return Intrinsic::not_intrinsic;
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID < Intrinsic::num_intrinsics)
- return IID;
- return Intrinsic::not_intrinsic;
- }
- }
-}
-
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and:
@@ -10822,6 +11291,12 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
+ // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+ // v4i32s. This is really a truncate, which we can construct out of (legal)
+ // concats and truncate nodes.
+ if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+ return M;
+
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
@@ -11121,29 +11596,36 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
return SDValue();
- EVT WideVT;
- SDValue ExtVec;
+ // Here narrow and wide refer to the vector element types. After "casting"
+ // both vectors must have the same bit length and so because the subvector
+ // has fewer elements, those elements need to be bigger.
+ EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
+ EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
+ // NOP cast operands to the largest legal vector of the same element count.
if (VT.isFloatingPoint()) {
- // The InVT type should be legal. We can safely cast the unpacked
- // subvector from InVT -> VT.
- WideVT = VT;
- ExtVec = getSVESafeBitCast(VT, Vec1, DAG);
+ Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
+ Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
} else {
- // Extend elements of smaller vector...
- WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
- ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+ // Legal integer vectors are already their largest so Vec0 is fine as is.
+ Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
}
+ // To replace the top/bottom half of vector V with vector SubV we widen the
+ // preserved half of V, concatenate this to SubV (the order depending on the
+ // half being replaced) and then narrow the result.
+ SDValue Narrow;
if (Idx == 0) {
SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
- return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
- } else if (Idx == InVT.getVectorMinNumElements()) {
+ Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
+ } else {
+ assert(Idx == InVT.getVectorMinNumElements() &&
+ "Invalid subvector index!");
SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
- return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
+ Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
}
- return SDValue();
+ return getSVESafeBitCast(VT, Narrow, DAG);
}
if (Idx == 0 && isPackedVectorType(VT, DAG)) {
@@ -11249,21 +11731,8 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
- unsigned PFIndexes[4];
- for (unsigned i = 0; i != 4; ++i) {
- if (M[i] < 0)
- PFIndexes[i] = 8;
- else
- PFIndexes[i] = M[i];
- }
-
- // Compute the index in the perfect shuffle table.
- unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
- PFIndexes[2] * 9 + PFIndexes[3];
- unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
- unsigned Cost = (PFEntry >> 30);
-
- if (Cost <= 4)
+ unsigned Cost = getPerfectShuffleCost(M);
+ if (Cost <= 1)
return true;
}
@@ -11360,9 +11829,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
- default:
- llvm_unreachable("unexpected shift opcode");
-
case ISD::SHL:
if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
@@ -11405,7 +11871,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
return NegShiftLeft;
}
- return SDValue();
+ llvm_unreachable("unexpected shift opcode");
}
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
@@ -11525,8 +11991,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
// Make v4f16 (only) fcmp operations utilise vector instructions
// v8f16 support will be a litle more complicated
@@ -11594,7 +12059,8 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
(Op.getOpcode() != ISD::VECREDUCE_ADD &&
SrcVT.getVectorElementType() == MVT::i64);
if (SrcVT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+ useSVEForFixedLengthVectorVT(
+ SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
if (SrcVT.getVectorElementType() == MVT::i1)
return LowerPredReductionToSVE(Op, DAG);
@@ -11659,7 +12125,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SelectionDAG &DAG) const {
- auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
return SDValue();
@@ -11676,7 +12142,7 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
- auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
return SDValue();
@@ -11772,8 +12238,8 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
SDLoc DL(Op);
APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
- return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
- DL, VT);
+ return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
+ VT);
}
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
@@ -11867,23 +12333,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
- PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+ Type *ValTy = I.getParamElementType(0);
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
+ Info.memVT = MVT::getVT(ValTy);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
+ Info.align = DL.getABITypeAlign(ValTy);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
- PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Type *ValTy = I.getParamElementType(1);
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
+ Info.memVT = MVT::getVT(ValTy);
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
+ Info.align = DL.getABITypeAlign(ValTy);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -11906,22 +12372,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_sve_ldnt1: {
- PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Type *ElTy = cast<VectorType>(I.getType())->getElementType();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
+ Info.align = DL.getABITypeAlign(ElTy);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
- PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
+ Type *ElTy =
+ cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
+ Info.align = DL.getABITypeAlign(ElTy);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
return true;
}
@@ -12007,8 +12474,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
Instruction *User = I->user_back();
- if (User &&
- !(User->getOpcode() == Instruction::FSub ||
+ if (!(User->getOpcode() == Instruction::FSub ||
User->getOpcode() == Instruction::FAdd))
return true;
@@ -12194,9 +12660,6 @@ static bool isSplatShuffle(Value *V) {
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TargetLowering::shouldSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
- if (!I->getType()->isVectorTy())
- return false;
-
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::aarch64_neon_smull:
@@ -12208,6 +12671,12 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
LLVM_FALLTHROUGH;
+ case Intrinsic::fma:
+ if (isa<VectorType>(I->getType()) &&
+ cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
+ !Subtarget->hasFullFP16())
+ return false;
+ LLVM_FALLTHROUGH;
case Intrinsic::aarch64_neon_sqdmull:
case Intrinsic::aarch64_neon_sqdmulh:
case Intrinsic::aarch64_neon_sqrdmulh:
@@ -12217,7 +12686,52 @@ bool AArch64TargetLowering::shouldSinkOperands(
if (isSplatShuffle(II->getOperand(1)))
Ops.push_back(&II->getOperandUse(1));
return !Ops.empty();
-
+ case Intrinsic::aarch64_sme_write_horiz:
+ case Intrinsic::aarch64_sme_write_vert:
+ case Intrinsic::aarch64_sme_writeq_horiz:
+ case Intrinsic::aarch64_sme_writeq_vert: {
+ auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
+ if (!Idx || Idx->getOpcode() != Instruction::Add)
+ return false;
+ Ops.push_back(&II->getOperandUse(1));
+ return true;
+ }
+ case Intrinsic::aarch64_sme_read_horiz:
+ case Intrinsic::aarch64_sme_read_vert:
+ case Intrinsic::aarch64_sme_readq_horiz:
+ case Intrinsic::aarch64_sme_readq_vert:
+ case Intrinsic::aarch64_sme_ld1b_vert:
+ case Intrinsic::aarch64_sme_ld1h_vert:
+ case Intrinsic::aarch64_sme_ld1w_vert:
+ case Intrinsic::aarch64_sme_ld1d_vert:
+ case Intrinsic::aarch64_sme_ld1q_vert:
+ case Intrinsic::aarch64_sme_st1b_vert:
+ case Intrinsic::aarch64_sme_st1h_vert:
+ case Intrinsic::aarch64_sme_st1w_vert:
+ case Intrinsic::aarch64_sme_st1d_vert:
+ case Intrinsic::aarch64_sme_st1q_vert:
+ case Intrinsic::aarch64_sme_ld1b_horiz:
+ case Intrinsic::aarch64_sme_ld1h_horiz:
+ case Intrinsic::aarch64_sme_ld1w_horiz:
+ case Intrinsic::aarch64_sme_ld1d_horiz:
+ case Intrinsic::aarch64_sme_ld1q_horiz:
+ case Intrinsic::aarch64_sme_st1b_horiz:
+ case Intrinsic::aarch64_sme_st1h_horiz:
+ case Intrinsic::aarch64_sme_st1w_horiz:
+ case Intrinsic::aarch64_sme_st1d_horiz:
+ case Intrinsic::aarch64_sme_st1q_horiz: {
+ auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
+ if (!Idx || Idx->getOpcode() != Instruction::Add)
+ return false;
+ Ops.push_back(&II->getOperandUse(3));
+ return true;
+ }
+ case Intrinsic::aarch64_neon_pmull:
+ if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
+ return false;
+ Ops.push_back(&II->getOperandUse(0));
+ Ops.push_back(&II->getOperandUse(1));
+ return true;
case Intrinsic::aarch64_neon_pmull64:
if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
II->getArgOperand(1)))
@@ -12225,12 +12739,14 @@ bool AArch64TargetLowering::shouldSinkOperands(
Ops.push_back(&II->getArgOperandUse(0));
Ops.push_back(&II->getArgOperandUse(1));
return true;
-
default:
return false;
}
}
+ if (!I->getType()->isVectorTy())
+ return false;
+
switch (I->getOpcode()) {
case Instruction::Sub:
case Instruction::Add: {
@@ -12745,12 +13261,15 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
assert(VT.isScalableVector() && "Can only lower scalable vectors");
unsigned N, Opcode;
- static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
- {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
- {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
- {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
-
- std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+ static const std::pair<unsigned, std::pair<unsigned, unsigned>>
+ IntrinsicMap[] = {
+ {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+ std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
+ return P.first == Intrinsic;
+ })->second;
assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
"invalid tuple vector type!");
@@ -12850,7 +13369,7 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
// if the folding leads to worse code.
bool AArch64TargetLowering::isMulAddWithConstProfitable(
- const SDValue &AddNode, const SDValue &ConstNode) const {
+ SDValue AddNode, SDValue ConstNode) const {
// Let the DAGCombiner decide for vector types and large types.
const EVT VT = AddNode.getValueType();
if (VT.isVector() || VT.getScalarSizeInBits() > 64)
@@ -13025,6 +13544,28 @@ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return true;
}
+bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ // Don't allow multiuse shift folding with the same shift amount.
+ if (!N->getOperand(0)->hasOneUse())
+ return false;
+
+ // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
+ EVT VT = N->getValueType(0);
+ if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
+ auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+ auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
+ }
+
+ return true;
+}
+
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
@@ -13221,6 +13762,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
}
+// Given an (integer) vecreduce, we know the order of the inputs does not
+// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
+// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
+// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
+static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+  auto DetectAddExtract = [&](SDValue A) {
+    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
+    // UADDLP(x) if found.
+    if (A.getOpcode() != ISD::ADD)
+      return SDValue();
+    EVT VT = A.getValueType();
+    SDValue Op0 = A.getOperand(0);
+    SDValue Op1 = A.getOperand(1);
+    // Both operands must be the same kind of extend (zext/zext or sext/sext).
+    // Note: compare Op0 against Op1 -- comparing Op0 to itself would make the
+    // first clause vacuous and let mismatched extends through.
+    if (Op0.getOpcode() != Op1.getOpcode() ||
+        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
+         Op0.getOpcode() != ISD::SIGN_EXTEND))
+      return SDValue();
+    SDValue Ext0 = Op0.getOperand(0);
+    SDValue Ext1 = Op1.getOperand(0);
+    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Ext0.getOperand(0) != Ext1.getOperand(0))
+      return SDValue();
+    // Check that the type is twice the add types, and the extracts are from
+    // upper/lower parts of the same source.
+    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
+        VT.getVectorNumElements() * 2)
+      return SDValue();
+    if ((Ext0.getConstantOperandVal(1) != 0 &&
+         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
+        (Ext1.getConstantOperandVal(1) != 0 &&
+         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
+      return SDValue();
+    // Zero-extends pair up into UADDLP, sign-extends into SADDLP.
+    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
+                                                          : AArch64ISD::SADDLP;
+    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
+  };
+
+  SDValue A = N->getOperand(0);
+  if (SDValue R = DetectAddExtract(A))
+    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
+  // The pattern may also appear one add deeper: add(y, add(zext, zext)).
+  if (A.getOpcode() == ISD::ADD) {
+    if (SDValue R = DetectAddExtract(A.getOperand(0)))
+      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
+                                     A.getOperand(1)));
+    if (SDValue R = DetectAddExtract(A.getOperand(1)))
+      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
+                                     A.getOperand(0)));
+  }
+  return SDValue();
+}
+
+
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -13279,6 +13875,60 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
+SDValue
+AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SREM as SREM
+
+ EVT VT = N->getValueType(0);
+
+ // For scalable and fixed types, mark them as cheap so we can handle it much
+ // later. This allows us to handle larger than legal types.
+ if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
+ return SDValue(N, 0);
+
+ // fold (srem X, pow2)
+ if ((VT != MVT::i32 && VT != MVT::i64) ||
+ !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
+ return SDValue();
+
+ unsigned Lg2 = Divisor.countTrailingZeros();
+ if (Lg2 == 0)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue CCVal, CSNeg;
+ if (Lg2 == 1) {
+ SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
+ CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(And.getNode());
+ } else {
+ SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
+ SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
+ SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
+ CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
+ Negs.getValue(1));
+
+ Created.push_back(Negs.getNode());
+ Created.push_back(AndPos.getNode());
+ Created.push_back(AndNeg.getNode());
+ }
+
+ return CSNeg;
+}
+
static bool IsSVECntIntrinsic(SDValue S) {
switch(getIntrinsicID(S.getNode())) {
default:
@@ -13300,11 +13950,10 @@ static bool IsSVECntIntrinsic(SDValue S) {
/// operations need a bit more inspection to get this information.
///
/// \param Extend The SDNode from the DAG that represents the extend operation
-/// \param DAG The SelectionDAG hosting the \p Extend node
///
/// \returns The type representing the \p Extend source type, or \p MVT::Other
/// if no valid type can be determined
-static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
+static EVT calculatePreExtendType(SDValue Extend) {
switch (Extend.getOpcode()) {
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
@@ -13337,102 +13986,90 @@ static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
default:
return MVT::Other;
}
-
- llvm_unreachable("Code path unhandled in calculatePreExtendType!");
}
-/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
-/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
-static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
- SelectionDAG &DAG) {
-
- ShuffleVectorSDNode *ShuffleNode =
- dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
- if (!ShuffleNode)
- return SDValue();
-
- // Ensuring the mask is zero before continuing
- if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
- return SDValue();
-
- SDValue InsertVectorElt = VectorShuffle.getOperand(0);
-
- if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
- return SDValue();
-
- SDValue InsertLane = InsertVectorElt.getOperand(2);
- ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
- // Ensures the insert is inserting into lane 0
- if (!Constant || Constant->getZExtValue() != 0)
+/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
+/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
+/// SExt/ZExt rather than the scalar SExt/ZExt
+static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
+ EVT VT = BV.getValueType();
+ if (BV.getOpcode() != ISD::BUILD_VECTOR &&
+ BV.getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
- SDValue Extend = InsertVectorElt.getOperand(1);
+ // Use the first item in the buildvector/shuffle to get the size of the
+ // extend, and make sure it looks valid.
+ SDValue Extend = BV->getOperand(0);
unsigned ExtendOpcode = Extend.getOpcode();
-
bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
ExtendOpcode == ISD::AssertSext;
if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
return SDValue();
-
- EVT TargetType = VectorShuffle.getValueType();
- EVT PreExtendType = calculatePreExtendType(Extend, DAG);
-
- if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
- TargetType != MVT::v2i64) ||
- (PreExtendType == MVT::Other))
+ // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
+ // calculatePreExtendType will work without issue.
+ if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
return SDValue();
// Restrict valid pre-extend data type
- if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
- PreExtendType != MVT::i32)
- return SDValue();
-
- EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
-
- if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
- return SDValue();
-
- if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
+ EVT PreExtendType = calculatePreExtendType(Extend);
+ if (PreExtendType == MVT::Other ||
+ PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
return SDValue();
- SDLoc DL(VectorShuffle);
-
- SDValue InsertVectorNode = DAG.getNode(
- InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
- DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
- DAG.getConstant(0, DL, MVT::i64));
-
- std::vector<int> ShuffleMask(TargetType.getVectorNumElements());
-
- SDValue VectorShuffleNode =
- DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
- DAG.getUNDEF(PreExtendVT), ShuffleMask);
-
- SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
- DL, TargetType, VectorShuffleNode);
+ // Make sure all other operands are equally extended
+ for (SDValue Op : drop_begin(BV->ops())) {
+ if (Op.isUndef())
+ continue;
+ unsigned Opc = Op.getOpcode();
+ bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
+ Opc == ISD::AssertSext;
+ if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
+ return SDValue();
+ }
- return ExtendNode;
+ SDValue NBV;
+ SDLoc DL(BV);
+ if (BV.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
+ EVT PreExtendLegalType =
+ PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
+ SmallVector<SDValue, 8> NewOps;
+ for (SDValue Op : BV->ops())
+ NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
+ : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
+ PreExtendLegalType));
+ NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
+ } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
+ EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
+ NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
+ BV.getOperand(1).isUndef()
+ ? DAG.getUNDEF(PreExtendVT)
+ : BV.getOperand(1).getOperand(0),
+ cast<ShuffleVectorSDNode>(BV)->getMask());
+ }
+ return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
}
/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
// If the value type isn't a vector, none of the operands are going to be dups
- if (!Mul->getValueType(0).isVector())
+ EVT VT = Mul->getValueType(0);
+ if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
return SDValue();
- SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
- SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
+ SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
+ SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
// Neither operands have been changed, don't make any further changes
if (!Op0 && !Op1)
return SDValue();
SDLoc DL(Mul);
- return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
- Op0 ? Op0 : Mul->getOperand(0),
+ return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
Op1 ? Op1 : Mul->getOperand(1));
}
@@ -13649,7 +14286,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->getAlignment(),
+ LN0->getPointerInfo(), LN0->getAlign(),
LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
@@ -13676,8 +14313,10 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Op = N->getOperand(0);
- if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
- Op.getOpcode() != ISD::FMUL)
+ if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
+ return SDValue();
+
+ if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
return SDValue();
SDValue ConstVec = Op->getOperand(1);
@@ -13713,7 +14352,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
N->getOpcode() == ISD::FP_TO_UINT_SAT) {
EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- if (SatVT.getScalarSizeInBits() != IntBits)
+ if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
return SDValue();
}
@@ -13956,15 +14595,85 @@ static SDValue tryCombineToBSL(SDNode *N,
return SDValue();
}
+// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
+// convert to csel(ccmp(.., cc0)), depending on cc1:
+
+// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
+// =>
+// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
+//
+// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
+// =>
+// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
+static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue CSel0 = N->getOperand(0);
+ SDValue CSel1 = N->getOperand(1);
+
+ if (CSel0.getOpcode() != AArch64ISD::CSEL ||
+ CSel1.getOpcode() != AArch64ISD::CSEL)
+ return SDValue();
+
+ if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
+ return SDValue();
+
+ if (!isNullConstant(CSel0.getOperand(0)) ||
+ !isOneConstant(CSel0.getOperand(1)) ||
+ !isNullConstant(CSel1.getOperand(0)) ||
+ !isOneConstant(CSel1.getOperand(1)))
+ return SDValue();
+
+ SDValue Cmp0 = CSel0.getOperand(3);
+ SDValue Cmp1 = CSel1.getOperand(3);
+ AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
+ AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
+ if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
+ return SDValue();
+ if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
+ Cmp0.getOpcode() == AArch64ISD::SUBS) {
+ std::swap(Cmp0, Cmp1);
+ std::swap(CC0, CC1);
+ }
+
+ if (Cmp1.getOpcode() != AArch64ISD::SUBS)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue CCmp;
+
+ if (N->getOpcode() == ISD::AND) {
+ AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
+ SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
+ } else {
+ SDLoc DL(N);
+ AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
+ SDValue Condition = DAG.getConstant(CC0, DL, MVT_CC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
+ }
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
+ CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
+ CCmp);
+}
+
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
- // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
+ if (SDValue R = performANDORCSELCombine(N, DAG))
+ return R;
+
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
+ // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
@@ -14015,7 +14724,7 @@ static SDValue performSVEAndCombine(SDNode *N,
SDValue UnpkOp = Src->getOperand(0);
SDValue Dup = N->getOperand(1);
- if (Dup.getOpcode() != AArch64ISD::DUP)
+ if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
return SDValue();
SDLoc DL(N);
@@ -14038,8 +14747,7 @@ static SDValue performSVEAndCombine(SDNode *N,
// Otherwise, make sure we propagate the AND to the operand
// of the unpack
- Dup = DAG.getNode(AArch64ISD::DUP, DL,
- UnpkOp->getValueType(0),
+ Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
SDValue And = DAG.getNode(ISD::AND, DL,
@@ -14097,20 +14805,34 @@ static SDValue performANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+
+ if (SDValue R = performANDORCSELCombine(N, DAG))
+ return R;
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
+ // Although NEON has no EORV instruction, when only the least significant bit
+ // is required the operation is synonymous with ADDV.
+ if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) &&
+ LHS.getOperand(0).getValueType().isFixedLengthVector() &&
+ LHS.hasOneUse()) {
+ SDLoc DL(N);
+ SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0));
+ return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS);
+ }
+
if (VT.isScalableVector())
return performSVEAndCombine(N, DCI);
// The combining code below works only for NEON vectors. In particular, it
// does not work for SVE when dealing with vectors wider than 128 bits.
- if (!(VT.is64BitVector() || VT.is128BitVector()))
+ if (!VT.is64BitVector() && !VT.is128BitVector())
return SDValue();
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
if (!BVN)
return SDValue();
@@ -14141,107 +14863,125 @@ static SDValue performANDCombine(SDNode *N,
return SDValue();
}
-// Attempt to form urhadd(OpA, OpB) from
-// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
-// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
-// The original form of the first expression is
-// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
-// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
-// Before this function is called the srl will have been lowered to
-// AArch64ISD::VLSHR.
-// This pass can also recognize signed variants of the patterns that use sign
-// extension instead of zero extension and form a srhadd(OpA, OpB) or a
-// shadd(OpA, OpB) from them.
-static SDValue
-performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
+static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
+ switch (Opcode) {
+ case ISD::STRICT_FADD:
+ case ISD::FADD:
+ return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
+ case ISD::ADD:
+ return VT == MVT::i64;
+ default:
+ return false;
+ }
+}
- // Since we are looking for a right shift by a constant value of 1 and we are
- // operating on types at least 16 bits in length (sign/zero extended OpA and
- // OpB, which are at least 8 bits), it follows that the truncate will always
- // discard the shifted-in bit and therefore the right shift will be logical
- // regardless of the signedness of OpA and OpB.
- SDValue Shift = N->getOperand(0);
- if (Shift.getOpcode() != AArch64ISD::VLSHR)
+static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
+ AArch64CC::CondCode Cond);
+
+static bool isPredicateCCSettingOp(SDValue N) {
+ if ((N.getOpcode() == ISD::SETCC) ||
+ (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
+ N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
+ // get_active_lane_mask is lowered to a whilelo instruction.
+ N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
+ return true;
+
+ return false;
+}
+
+// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
+// ... into: "ptrue p, all" + PTEST
+static SDValue
+performFirstTrueTestVectorCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+ // Make sure PTEST can be legalised with illegal types.
+ if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
return SDValue();
- // Is the right shift using an immediate value of 1?
- uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
- if (ShiftAmount != 1)
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType();
+
+ if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
+ !isNullConstant(N->getOperand(1)))
return SDValue();
- SDValue ExtendOpA, ExtendOpB;
- SDValue ShiftOp0 = Shift.getOperand(0);
- unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
- if (ShiftOp0Opc == ISD::SUB) {
+ // Restrict the DAG combine to only cases where we're extracting from a
+ // flag-setting operation.
+ if (!isPredicateCCSettingOp(N0))
+ return SDValue();
- SDValue Xor = ShiftOp0.getOperand(1);
- if (Xor.getOpcode() != ISD::XOR)
- return SDValue();
+ // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
+ return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
+}
- // Is the XOR using a constant amount of all ones in the right hand side?
- uint64_t C;
- if (!isAllConstantBuildVector(Xor.getOperand(1), C))
- return SDValue();
+// Materialize : Idx = (add (mul vscale, NumEls), -1)
+// i1 = extract_vector_elt t37, Constant:i64<Idx>
+// ... into: "ptrue p, all" + PTEST
+static SDValue
+performLastTrueTestVectorCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+ // Make sure PTEST can be legalised with illegal types.
+ if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
+ return SDValue();
- unsigned ElemSizeInBits = VT.getScalarSizeInBits();
- APInt CAsAPInt(ElemSizeInBits, C);
- if (CAsAPInt != APInt::getAllOnes(ElemSizeInBits))
- return SDValue();
+ SDValue N0 = N->getOperand(0);
+ EVT OpVT = N0.getValueType();
- ExtendOpA = Xor.getOperand(0);
- ExtendOpB = ShiftOp0.getOperand(0);
- } else if (ShiftOp0Opc == ISD::ADD) {
- ExtendOpA = ShiftOp0.getOperand(0);
- ExtendOpB = ShiftOp0.getOperand(1);
- } else
+ if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
return SDValue();
- unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
- unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
- if (!(ExtendOpAOpc == ExtendOpBOpc &&
- (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+ // Idx == (add (mul vscale, NumEls), -1)
+ SDValue Idx = N->getOperand(1);
+ if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
return SDValue();
- // Is the result of the right shift being truncated to the same value type as
- // the original operands, OpA and OpB?
- SDValue OpA = ExtendOpA.getOperand(0);
- SDValue OpB = ExtendOpB.getOperand(0);
- EVT OpAVT = OpA.getValueType();
- assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
- if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+ SDValue VS = Idx.getOperand(0);
+ if (VS.getOpcode() != ISD::VSCALE)
return SDValue();
- SDLoc DL(N);
- bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
- bool IsRHADD = ShiftOp0Opc == ISD::SUB;
- unsigned HADDOpc = IsSignExtend
- ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
- : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
- SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
+ unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
+ if (VS.getConstantOperandVal(0) != NumEls)
+ return SDValue();
- return ResultHADD;
+ // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
+ return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
}
-static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
- switch (Opcode) {
- case ISD::FADD:
- return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
- case ISD::ADD:
- return VT == MVT::i64;
- default:
- return false;
- }
-}
+static SDValue
+performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+ if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
+ return Res;
+ if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
+ return Res;
-static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
EVT VT = N->getValueType(0);
- const bool FullFP16 =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+ const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
+ bool IsStrict = N0->isStrictFPOpcode();
+
+ // extract(dup x) -> x
+ if (N0.getOpcode() == AArch64ISD::DUP)
+ return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Rewrite for pairwise fadd pattern
// (f32 (extract_vector_elt
@@ -14250,11 +14990,14 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
// ->
// (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
// (extract_vector_elt (vXf32 Other) 1))
+ // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
+ // we can only do this when it's used only by the extract_vector_elt.
if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
- hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
+ hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
+ (!IsStrict || N0.hasOneUse())) {
SDLoc DL(N0);
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
+ SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
+ SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
SDValue Other = N00;
@@ -14267,11 +15010,23 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
Other == Shuffle->getOperand(0)) {
- return DAG.getNode(N0->getOpcode(), DL, VT,
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
- DAG.getConstant(0, DL, MVT::i64)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
- DAG.getConstant(1, DL, MVT::i64)));
+ SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
+ DAG.getConstant(1, DL, MVT::i64));
+ if (!IsStrict)
+ return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
+
+ // For strict_fadd we need uses of the final extract_vector to be replaced
+ // with the strict_fadd, but we also need uses of the chain output of the
+ // original strict_fadd to use the chain output of the new strict_fadd as
+ // otherwise it may not be deleted.
+ SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
+ {VT, MVT::Other},
+ {N0->getOperand(0), Extract1, Extract2});
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
+ return SDValue(N, 0);
}
}
@@ -14321,25 +15076,61 @@ static SDValue performConcatVectorsCombine(SDNode *N,
}
}
+ if (N->getOperand(0).getValueType() == MVT::v4i8) {
+ // If we have a concat of v4i8 loads, convert them to a buildvector of f32
+ // loads to prevent having to go through the v4i8 load legalization that
+ // needs to extend each element into a larger type.
+ if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
+ if (V.getValueType() != MVT::v4i8)
+ return false;
+ if (V.isUndef())
+ return true;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
+ return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
+ LD->getExtensionType() == ISD::NON_EXTLOAD;
+ })) {
+ EVT NVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+ SmallVector<SDValue> Ops;
+
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ SDValue V = N->getOperand(i);
+ if (V.isUndef())
+ Ops.push_back(DAG.getUNDEF(MVT::f32));
+ else {
+ LoadSDNode *LD = cast<LoadSDNode>(V);
+ SDValue NewLoad =
+ DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+ Ops.push_back(NewLoad);
+ }
+ }
+ return DAG.getBitcast(N->getValueType(0),
+ DAG.getBuildVector(NVT, dl, Ops));
+ }
+ }
+
+
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
- // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
- // subvectors from the same original vectors. Combine these into a single
- // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
- // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
- // extract_subvector (v16i8 OpB,
- // <0>))),
- // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
- // extract_subvector (v16i8 OpB,
- // <8>)))))
+ // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
+ // extracted subvectors from the same original vectors. Combine these into a
+ // single avg that operates on the two original vectors.
+ // avgceil is the target independent name for rhadd, avgfloor is a hadd.
+ // Example:
+ // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
+ // extract_subvector (v16i8 OpB, <0>))),
+ // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
+ // extract_subvector (v16i8 OpB, <8>)))))
// ->
- // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
+ // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
- (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
- N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
+ (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
+ N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
SDValue N10 = N1->getOperand(0);
@@ -14411,6 +15202,29 @@ static SDValue performConcatVectorsCombine(SDNode *N,
}
static SDValue
+performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue V = N->getOperand(0);
+
+ // NOTE: This combine exists in DAGCombiner, but that version's legality check
+ // blocks this combine because the non-const case requires custom lowering.
+ //
+ // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
+ if (V.getOpcode() == ISD::SPLAT_VECTOR)
+ if (isa<ConstantSDNode>(V.getOperand(0)))
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
+
+ return SDValue();
+}
+
+static SDValue
performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
SDLoc DL(N);
@@ -14470,33 +15284,34 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
// Check the operand and see if it originates from a lane extract.
SDValue Op1 = N->getOperand(1);
- if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- // Yep, no additional predication needed. Perform the transform.
- SDValue IID = N->getOperand(0);
- SDValue Shift = N->getOperand(2);
- SDValue Vec = Op1.getOperand(0);
- SDValue Lane = Op1.getOperand(1);
- EVT ResTy = N->getValueType(0);
- EVT VecResTy;
- SDLoc DL(N);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
- // The vector width should be 128 bits by the time we get here, even
- // if it started as 64 bits (the extract_vector handling will have
- // done so).
- assert(Vec.getValueSizeInBits() == 128 &&
- "unexpected vector size on extract_vector_elt!");
- if (Vec.getValueType() == MVT::v4i32)
- VecResTy = MVT::v4f32;
- else if (Vec.getValueType() == MVT::v2i64)
- VecResTy = MVT::v2f64;
- else
- llvm_unreachable("unexpected vector type!");
+ // Yep, no additional predication needed. Perform the transform.
+ SDValue IID = N->getOperand(0);
+ SDValue Shift = N->getOperand(2);
+ SDValue Vec = Op1.getOperand(0);
+ SDValue Lane = Op1.getOperand(1);
+ EVT ResTy = N->getValueType(0);
+ EVT VecResTy;
+ SDLoc DL(N);
- SDValue Convert =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
- }
- return SDValue();
+ // The vector width should be 128 bits by the time we get here, even
+ // if it started as 64 bits (the extract_vector handling will have
+ // done so). Bail if it is not.
+ if (Vec.getValueSizeInBits() != 128)
+ return SDValue();
+
+ if (Vec.getValueType() == MVT::v4i32)
+ VecResTy = MVT::v4f32;
+ else if (Vec.getValueType() == MVT::v2i64)
+ VecResTy = MVT::v2f64;
+ else
+ return SDValue();
+
+ SDValue Convert =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}
// AArch64 high-vector "long" operations are formed by performing the non-high
@@ -14515,6 +15330,11 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+ MVT VT = N.getSimpleValueType();
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N.getConstantOperandVal(1) == 0)
+ N = N.getOperand(0);
+
switch (N.getOpcode()) {
case AArch64ISD::DUP:
case AArch64ISD::DUPLANE8:
@@ -14535,18 +15355,19 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
return SDValue();
}
- MVT NarrowTy = N.getSimpleValueType();
- if (!NarrowTy.is64BitVector())
+ if (!VT.is64BitVector())
return SDValue();
- MVT ElementTy = NarrowTy.getVectorElementType();
- unsigned NumElems = NarrowTy.getVectorNumElements();
- MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+ SDLoc DL(N);
+ unsigned NumElems = VT.getVectorNumElements();
+ if (N.getValueType().is64BitVector()) {
+ MVT ElementTy = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+ N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
+ }
- SDLoc dl(N);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
- DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
- DAG.getConstant(NumElems, dl, MVT::i64));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
+ DAG.getConstant(NumElems, DL, MVT::i64));
}
static bool isEssentiallyExtractHighSubvector(SDValue N) {
@@ -14696,7 +15517,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
}
// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
-static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Only scalar integer and vector types.
if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
@@ -14732,6 +15553,81 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, DL, MVT::i64));
}
+/// Perform the scalar expression combine in the form of:
+/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
+/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
+static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+  // Handle commutativity.
+ if (LHS.getOpcode() != AArch64ISD::CSEL &&
+ LHS.getOpcode() != AArch64ISD::CSNEG) {
+ std::swap(LHS, RHS);
+ if (LHS.getOpcode() != AArch64ISD::CSEL &&
+ LHS.getOpcode() != AArch64ISD::CSNEG) {
+ return SDValue();
+ }
+ }
+
+ if (!LHS.hasOneUse())
+ return SDValue();
+
+ AArch64CC::CondCode AArch64CC =
+ static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
+
+ // The CSEL should include a const one operand, and the CSNEG should include
+ // One or NegOne operand.
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ if (!CTVal || !CFVal)
+ return SDValue();
+
+ if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
+ (CTVal->isOne() || CFVal->isOne())) &&
+ !(LHS.getOpcode() == AArch64ISD::CSNEG &&
+ (CTVal->isOne() || CFVal->isAllOnes())))
+ return SDValue();
+
+ // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
+ if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
+ !CFVal->isOne()) {
+ std::swap(CTVal, CFVal);
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+ }
+
+ SDLoc DL(N);
+ // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
+ if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
+ !CFVal->isAllOnes()) {
+ APInt C = -1 * CFVal->getAPIntValue();
+ CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
+ CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+ }
+
+  // It might be neutral for larger constants, as the immediate needs to be
+  // materialized in a register.
+ APInt ADDC = CTVal->getAPIntValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
+ return SDValue();
+
+ assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
+ (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
+ "Unexpected constant value");
+
+ SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
+ SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
+ SDValue Cmp = LHS.getOperand(3);
+
+ return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
+}
+
// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -14755,6 +15651,49 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
Dot.getOperand(2));
}
+static bool isNegatedInteger(SDValue Op) {
+ return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
+}
+
+static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
+}
+
+// Try to fold
+//
+// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
+//
+// The folding helps csel to be matched with csneg without generating
+// redundant neg instruction, which includes negation of the csel expansion
+// of abs node lowered by lowerABS.
+static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
+ if (!isNegatedInteger(SDValue(N, 0)))
+ return SDValue();
+
+ SDValue CSel = N->getOperand(1);
+ if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
+ return SDValue();
+
+ SDValue N0 = CSel.getOperand(0);
+ SDValue N1 = CSel.getOperand(1);
+
+  // If neither of them is a negation, the fold is not worthwhile, as it
+  // would introduce two additional negations while removing only one.
+ if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
+ return SDValue();
+
+ SDValue N0N = getNegatedInteger(N0, DAG);
+ SDValue N1N = getNegatedInteger(N1, DAG);
+
+ SDLoc DL(N);
+ EVT VT = CSel.getValueType();
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
+ CSel.getOperand(3));
+}
+
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
@@ -14808,14 +15747,120 @@ static SDValue performAddSubLongCombine(SDNode *N,
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
+static bool isCMP(SDValue Op) {
+ return Op.getOpcode() == AArch64ISD::SUBS &&
+ !Op.getNode()->hasAnyUseOfValue(0);
+}
+
+// (CSEL 1 0 CC Cond) => CC
+// (CSEL 0 1 CC Cond) => !CC
+static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
+ if (Op.getOpcode() != AArch64ISD::CSEL)
+ return None;
+ auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
+ if (CC == AArch64CC::AL || CC == AArch64CC::NV)
+ return None;
+ SDValue OpLHS = Op.getOperand(0);
+ SDValue OpRHS = Op.getOperand(1);
+ if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
+ return CC;
+ if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
+ return getInvertedCondCode(CC);
+
+ return None;
+}
+
+// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
+// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
+static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
+ SDValue CmpOp = Op->getOperand(2);
+ if (!isCMP(CmpOp))
+ return SDValue();
+
+ if (IsAdd) {
+ if (!isOneConstant(CmpOp.getOperand(1)))
+ return SDValue();
+ } else {
+ if (!isNullConstant(CmpOp.getOperand(0)))
+ return SDValue();
+ }
+
+ SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
+ auto CC = getCSETCondCode(CsetOp);
+ if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
+ return SDValue();
+
+ return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
+ Op->getOperand(0), Op->getOperand(1),
+ CsetOp.getOperand(3));
+}
+
+// (ADC x 0 cond) => (CINC x HS cond)
+static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Cond = N->getOperand(2);
+
+ if (!isNullConstant(RHS))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // (CINC x cc cond) <=> (CSINC x x !cc cond)
+ SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
+ return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
+}
+
+// Transform vector add(zext i8 to i32, zext i8 to i32)
+// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
+// This allows extra uses of saddl/uaddl at the lower vector widths, and less
+// extends.
+static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
+ (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
+ (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
+ N->getOperand(0).getOperand(0).getValueType() !=
+ N->getOperand(1).getOperand(0).getValueType())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0).getOperand(0);
+ SDValue N1 = N->getOperand(1).getOperand(0);
+ EVT InVT = N0.getValueType();
+
+ EVT S1 = InVT.getScalarType();
+ EVT S2 = VT.getScalarType();
+ if ((S2 == MVT::i32 && S1 == MVT::i8) ||
+ (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
+ SDLoc DL(N);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+ S2.getHalfSizedIntegerVT(*DAG.getContext()),
+ VT.getVectorElementCount());
+ SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
+ SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
+ SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
+ }
+ return SDValue();
+}
+
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Try to change sum of two reductions.
- if (SDValue Val = performUADDVCombine(N, DAG))
+ if (SDValue Val = performAddUADDVCombine(N, DAG))
return Val;
if (SDValue Val = performAddDotCombine(N, DAG))
return Val;
+ if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
+ return Val;
+ if (SDValue Val = performNegCSelCombine(N, DAG))
+ return Val;
+ if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
+ return Val;
return performAddSubLongCombine(N, DCI, DAG);
}
@@ -15176,6 +16221,9 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
return false;
}
+ if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
+ return true;
+
// "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
// or smaller than the implicit element type represented by N.
// NOTE: A larger element count implies a smaller element type.
@@ -15186,8 +16234,7 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
// If we're compiling for a specific vector-length, we can check if the
// pattern's VL equals that of the scalable vector at runtime.
if (N.getOpcode() == AArch64ISD::PTRUE) {
- const auto &Subtarget =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
if (MaxSVESize && MinSVESize == MaxSVESize) {
@@ -15233,6 +16280,39 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
+ case Intrinsic::get_active_lane_mask: {
+ SDValue Res = SDValue();
+ EVT VT = N->getValueType(0);
+ if (VT.isFixedLengthVector()) {
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+
+ EVT WhileVT = EVT::getVectorVT(
+ *DAG.getContext(), MVT::i1,
+ ElementCount::getScalable(VT.getVectorNumElements()));
+
+ // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+ EVT PromVT = getPromotedVTForPredicate(WhileVT);
+
+ // Get the fixed-width equivalent of PromVT for extraction.
+ EVT ExtVT =
+ EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+ VT.getVectorElementCount());
+
+ Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+ N->getOperand(1), N->getOperand(2));
+ Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
+ }
+ return Res;
+ }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -15261,7 +16341,11 @@ static SDValue performIntrinsicCombine(SDNode *N,
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
+ return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_umull:
+ return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
@@ -15350,6 +16434,10 @@ static SDValue performIntrinsicCombine(SDNode *N,
return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
case Intrinsic::aarch64_sve_orr:
return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
+ case Intrinsic::aarch64_sve_sabd:
+ return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
+ case Intrinsic::aarch64_sve_uabd:
+ return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
case Intrinsic::aarch64_sve_sqadd:
return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
case Intrinsic::aarch64_sve_sqsub:
@@ -15538,7 +16626,7 @@ static SDValue performExtendCombine(SDNode *N,
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) {
assert(!St.isTruncatingStore() && "cannot split truncating vector store");
- unsigned OrigAlignment = St.getAlignment();
+ Align OrigAlignment = St.getAlign();
unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
// Create scalar stores. This is at least as good as the code sequence for a
@@ -15563,7 +16651,7 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
unsigned Offset = EltOffset;
while (--NumVecElts) {
- unsigned Alignment = MinAlign(OrigAlignment, Offset);
+ Align Alignment = commonAlignment(OrigAlignment, Offset);
SDValue OffsetPtr =
DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
@@ -15636,10 +16724,6 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
- if (VT == MVT::nxv8bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
-
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
@@ -15667,9 +16751,6 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
"Unsupported opcode.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
- if (VT == MVT::nxv8bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
EVT LoadVT = VT;
if (VT.isFloatingPoint())
@@ -15692,10 +16773,6 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
EVT HwSrcVt = getSVEContainerType(DataVT);
SDValue InputVT = DAG.getValueType(DataVT);
- if (DataVT == MVT::nxv8bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
-
if (DataVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
@@ -15722,10 +16799,6 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
EVT DataVT = Data.getValueType();
EVT PtrTy = N->getOperand(4).getValueType();
- if (DataVT == MVT::nxv8bf16 &&
- !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
- return SDValue();
-
if (DataVT.isFloatingPoint())
Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
@@ -15912,8 +16985,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// extensions can use this to mark that it does not want splitting to happen
// (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
// eliminating alignment hazards is only 1 in 8 for alignment of 2.
- if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
- S->getAlignment() <= 2)
+ if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
+ S->getAlign() <= Align(2))
return SDValue();
// If we get a splat of a scalar convert this vector store to a store of
@@ -15934,11 +17007,11 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
- S->getAlignment(), S->getMemOperand()->getFlags());
+ S->getAlign(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
- S->getPointerInfo(), S->getAlignment(),
+ S->getPointerInfo(), S->getAlign(),
S->getMemOperand()->getFlags());
}
@@ -15970,6 +17043,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Op1 = N->getOperand(1);
EVT ResVT = N->getValueType(0);
+ // uzp1(x, undef) -> concat(truncate(x), undef)
+ if (Op1.getOpcode() == ISD::UNDEF) {
+ EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ break;
+ case MVT::v16i8:
+ BCVT = MVT::v8i16;
+ HalfVT = MVT::v8i8;
+ break;
+ case MVT::v8i16:
+ BCVT = MVT::v4i32;
+ HalfVT = MVT::v4i16;
+ break;
+ case MVT::v4i32:
+ BCVT = MVT::v2i64;
+ HalfVT = MVT::v2i32;
+ break;
+ }
+ if (BCVT != MVT::Other) {
+ SDValue BC = DAG.getBitcast(BCVT, Op0);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+ DAG.getUNDEF(HalfVT));
+ }
+ }
+
// uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
@@ -16267,6 +17367,152 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
+/// \return true if part of the index was folded into the Base.
+static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
+ SDLoc DL, SelectionDAG &DAG) {
+ // This function assumes a vector of i64 indices.
+ EVT IndexVT = Index.getValueType();
+ if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
+ return false;
+
+ // Simplify:
+ // BasePtr = Ptr
+ // Index = X + splat(Offset)
+ // ->
+ // BasePtr = Ptr + Offset * scale.
+ // Index = X
+ if (Index.getOpcode() == ISD::ADD) {
+ if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
+ Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
+ BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
+ Index = Index.getOperand(0);
+ return true;
+ }
+ }
+
+ // Simplify:
+ // BasePtr = Ptr
+ // Index = (X + splat(Offset)) << splat(Shift)
+ // ->
+  //    BasePtr = Ptr + (Offset << Shift) * scale
+ // Index = X << splat(shift)
+ if (Index.getOpcode() == ISD::SHL &&
+ Index.getOperand(0).getOpcode() == ISD::ADD) {
+ SDValue Add = Index.getOperand(0);
+ SDValue ShiftOp = Index.getOperand(1);
+ SDValue OffsetOp = Add.getOperand(1);
+ if (auto Shift = DAG.getSplatValue(ShiftOp))
+ if (auto Offset = DAG.getSplatValue(OffsetOp)) {
+ Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
+ Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
+ BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
+ Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
+ Add.getOperand(0), ShiftOp);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Analyse the specified address returning true if a more optimal addressing
+// mode is available. When returning true all parameters are updated to reflect
+// their recommended values.
+static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
+ SDValue &BasePtr, SDValue &Index,
+ SelectionDAG &DAG) {
+ // Try to iteratively fold parts of the index into the base pointer to
+ // simplify the index as much as possible.
+ bool Changed = false;
+ while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
+ Changed = true;
+
+ // Only consider element types that are pointer sized as smaller types can
+ // be easily promoted.
+ EVT IndexVT = Index.getValueType();
+ if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
+ return Changed;
+
+ // Match:
+ // Index = step(const)
+ int64_t Stride = 0;
+ if (Index.getOpcode() == ISD::STEP_VECTOR)
+ Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
+
+ // Match:
+ // Index = step(const) << shift(const)
+ else if (Index.getOpcode() == ISD::SHL &&
+ Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
+ SDValue RHS = Index.getOperand(1);
+ if (auto *Shift =
+ dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
+ int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
+ Stride = Step << Shift->getZExtValue();
+ }
+ }
+
+ // Return early because no supported pattern is found.
+ if (Stride == 0)
+ return Changed;
+
+ if (Stride < std::numeric_limits<int32_t>::min() ||
+ Stride > std::numeric_limits<int32_t>::max())
+ return Changed;
+
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ unsigned MaxVScale =
+ Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
+ int64_t LastElementOffset =
+ IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
+
+ if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
+ LastElementOffset > std::numeric_limits<int32_t>::max())
+ return Changed;
+
+ EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
+ // Stride does not scale explicitly by 'Scale', because it happens in
+ // the gather/scatter addressing mode.
+ Index = DAG.getNode(ISD::STEP_VECTOR, SDLoc(N), NewIndexVT,
+ DAG.getTargetConstant(Stride, SDLoc(N), MVT::i32));
+ return true;
+}
+
+static SDValue performMaskedGatherScatterCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+ MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
+ assert(MGS && "Can only combine gather load or scatter store nodes");
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ SDLoc DL(MGS);
+ SDValue Chain = MGS->getChain();
+ SDValue Scale = MGS->getScale();
+ SDValue Index = MGS->getIndex();
+ SDValue Mask = MGS->getMask();
+ SDValue BasePtr = MGS->getBasePtr();
+ ISD::MemIndexType IndexType = MGS->getIndexType();
+
+ if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
+ return SDValue();
+
+ // Here we catch such cases early and change MGATHER's IndexType to allow
+ // the use of an Index that's more legalisation friendly.
+ if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
+ SDValue PassThru = MGT->getPassThru();
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(
+ DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+ Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
+ }
+ auto *MSC = cast<MaskedScatterSDNode>(MGS);
+ SDValue Data = MSC->getValue();
+ SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
+ Ops, MSC->getMemOperand(), IndexType,
+ MSC->isTruncatingStore());
+}
+
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -16723,6 +17969,47 @@ static SDValue performBRCONDCombine(SDNode *N,
return SDValue();
}
+static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
+ unsigned CC = N->getConstantOperandVal(2);
+ SDValue SUBS = N->getOperand(3);
+ SDValue Zero, CTTZ;
+
+ if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
+ Zero = N->getOperand(0);
+ CTTZ = N->getOperand(1);
+ } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
+ Zero = N->getOperand(1);
+ CTTZ = N->getOperand(0);
+ } else
+ return SDValue();
+
+ if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
+ (CTTZ.getOpcode() == ISD::TRUNCATE &&
+ CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
+ return SDValue();
+
+ assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
+ "Illegal type in CTTZ folding");
+
+ if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
+ return SDValue();
+
+ SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
+ ? CTTZ.getOperand(0).getOperand(0)
+ : CTTZ.getOperand(0);
+
+ if (X != SUBS.getOperand(0))
+ return SDValue();
+
+ unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
+ ? CTTZ.getOperand(0).getValueSizeInBits()
+ : CTTZ.getValueSizeInBits();
+ SDValue BitWidthMinusOne =
+ DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
+ return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
+ BitWidthMinusOne);
+}
+
// Optimize CSEL instructions
static SDValue performCSELCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
@@ -16731,6 +18018,11 @@ static SDValue performCSELCombine(SDNode *N,
if (N->getOperand(0) == N->getOperand(1))
return N->getOperand(0);
+ // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
+ // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
+ if (SDValue Folded = foldCSELofCTTZ(N, DAG))
+ return Folded;
+
return performCONDCombine(N, DCI, DAG, 2, 3);
}
@@ -16739,14 +18031,14 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
// setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
if (Cond == ISD::SETNE && isOneConstant(RHS) &&
LHS->getOpcode() == AArch64ISD::CSEL &&
isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
LHS->hasOneUse()) {
- SDLoc DL(N);
-
// Invert CSEL's condition.
auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
@@ -16757,9 +18049,48 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
LHS.getOperand(3));
- return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
+ return DAG.getZExtOrTrunc(CSEL, DL, VT);
}
+ // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
+ if (Cond == ISD::SETNE && isNullConstant(RHS) &&
+ LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
+ LHS->hasOneUse()) {
+ EVT TstVT = LHS->getValueType(0);
+ if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
+ // this pattern will get better opt in emitComparison
+ uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
+ SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
+ DAG.getConstant(TstImm, DL, TstVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
+ }
+ }
+
+ return SDValue();
+}
+
+// Replace a flag-setting operator (eg ANDS) with the generic version
+// (eg AND) if the flag is unused.
+static SDValue performFlagSettingCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ unsigned GenericOpcode) {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If the flag result isn't used, convert back to a generic opcode.
+ if (!N->hasAnyUseOfValue(1)) {
+ SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
+ return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
+ DL);
+ }
+
+ // Combine identical generic nodes into this node, re-using the result.
+ if (SDNode *Generic = DCI.DAG.getNodeIfExists(
+ GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
+ DCI.CombineTo(Generic, SDValue(N, 0));
+
return SDValue();
}
@@ -16801,27 +18132,46 @@ static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue
+performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
"Unexpected opcode!");
+ SelectionDAG &DAG = DCI.DAG;
SDValue Pred = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
- // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
- // => inner setcc_merge_zero
- if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
- LHS->getOpcode() == ISD::SIGN_EXTEND &&
- LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
- LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
- LHS->getOperand(0)->getOperand(0) == Pred)
- return LHS->getOperand(0);
-
if (SDValue V = performSetCCPunpkCombine(N, DAG))
return V;
+ if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+ LHS->getOpcode() == ISD::SIGN_EXTEND &&
+ LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
+ // setcc_merge_zero(
+ // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
+ // => setcc_merge_zero(pred, ...)
+ if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+ LHS->getOperand(0)->getOperand(0) == Pred)
+ return LHS->getOperand(0);
+
+ // setcc_merge_zero(
+ // all_active, extend(nxvNi1 ...), != splat(0))
+ // -> nxvNi1 ...
+ if (isAllActivePredicate(DAG, Pred))
+ return LHS->getOperand(0);
+
+ // setcc_merge_zero(
+ // pred, extend(nxvNi1 ...), != splat(0))
+ // -> nxvNi1 and(pred, ...)
+ if (DCI.isAfterLegalizeDAG())
+ // Do this after legalization to allow more folds on setcc_merge_zero
+ // to be recognized.
+ return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
+ LHS->getOperand(0), Pred);
+ }
+
return SDValue();
}
@@ -16928,12 +18278,53 @@ static SDValue performTBZCombine(SDNode *N,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
+// Swap vselect operands where it may allow a predicated operation to achieve
+// the `sel`.
+//
+// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
+// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
+static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
+ auto SelectA = N->getOperand(1);
+ auto SelectB = N->getOperand(2);
+ auto NTy = N->getValueType(0);
+
+ if (!NTy.isScalableVector())
+ return SDValue();
+ SDValue SetCC = N->getOperand(0);
+ if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
+ return SDValue();
+
+ switch (SelectB.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::FMUL:
+ case ISD::FSUB:
+ case ISD::FADD:
+ break;
+ }
+ if (SelectA != SelectB.getOperand(0))
+ return SDValue();
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ ISD::CondCode InverseCC =
+ ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
+ auto InverseSetCC =
+ DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
+ SetCC.getOperand(1), InverseCC);
+
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
+ {InverseSetCC, SelectB, SelectA});
+}
+
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+ if (auto SwapResult = trySwapVSelectOperands(N, DAG))
+ return SwapResult;
+
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
@@ -17064,6 +18455,24 @@ static SDValue performSelectCombine(SDNode *N,
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
+static SDValue performDUPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
+ // 128bit vector version.
+ if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
+ EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
+ if (SDNode *LN = DCI.DAG.getNodeIfExists(
+ N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+ SDLoc DL(N);
+ return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
+ DCI.DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
+ return performPostLD1Combine(N, DCI, false);
+}
+
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -17104,13 +18513,14 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
// Check whether folding this offset is legal. It must not go out of bounds of
// the referenced object to avoid violating the code model, and must be
- // smaller than 2^21 because this is the largest offset expressible in all
- // object formats.
+ // smaller than 2^20 because this is the largest offset expressible in all
+ // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
+ // stores an immediate signed 21 bit offset.)
//
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
- if (Offset >= (1 << 21))
+ if (Offset >= (1 << 20))
return SDValue();
const GlobalValue *GV = GN->getGlobal();
@@ -17621,7 +19031,7 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return performPostLD1Combine(N, DCI, true);
}
-SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
EVT Ty = N->getValueType(0);
if (Ty.isInteger())
return SDValue();
@@ -17643,9 +19053,9 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getBitcast(Ty, Trunc);
}
-SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
+static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -17675,6 +19085,31 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget,
+ bool fixedSVEVectorVT) {
+ EVT VT = N->getValueType(0);
+
+  // Don't expand for SVE2 (or SME, which implies the SVE2 instructions)
+ if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
+ return SDValue();
+
+ // Don't expand for NEON
+ if (VT.isFixedLengthVector() && !fixedSVEVectorVT)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ SDValue Mask = N->getOperand(0);
+ SDValue In1 = N->getOperand(1);
+ SDValue In2 = N->getOperand(2);
+
+ SDValue InvMask = DAG.getNOT(DL, Mask, VT);
+ SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
+ SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
+ return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -17685,6 +19120,22 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ADD:
case ISD::SUB:
return performAddSubCombine(N, DCI, DAG);
+ case AArch64ISD::ANDS:
+ return performFlagSettingCombine(N, DCI, ISD::AND);
+ case AArch64ISD::ADC:
+ if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
+ return R;
+ return foldADCToCINC(N, DAG);
+ case AArch64ISD::SBC:
+ return foldOverflowCheck(N, DAG, /* IsAdd */ false);
+ case AArch64ISD::ADCS:
+ if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
+ return R;
+ return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
+ case AArch64ISD::SBCS:
+ if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
+ return R;
+ return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -17711,10 +19162,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performExtendCombine(N, DCI, DAG);
case ISD::SIGN_EXTEND_INREG:
return performSignExtendInRegCombine(N, DCI, DAG);
- case ISD::TRUNCATE:
- return performVectorTruncateCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::EXTRACT_SUBVECTOR:
+ return performExtractSubvectorCombine(N, DCI, DAG);
case ISD::INSERT_SUBVECTOR:
return performInsertSubvectorCombine(N, DCI, DAG);
case ISD::SELECT:
@@ -17729,6 +19180,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
+ case ISD::MGATHER:
+ case ISD::MSCATTER:
+ return performMaskedGatherScatterCombine(N, DCI, DAG);
case ISD::VECTOR_SPLICE:
return performSVESpliceCombine(N, DAG);
case ISD::FP_EXTEND:
@@ -17741,7 +19195,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::CSEL:
return performCSELCombine(N, DCI, DAG);
case AArch64ISD::DUP:
- return performPostLD1Combine(N, DCI, false);
+ return performDUPCombine(N, DCI);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case AArch64ISD::SPLICE:
@@ -17752,7 +19206,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
case AArch64ISD::SETCC_MERGE_ZERO:
- return performSetccMergeZeroCombine(N, DAG);
+ return performSetccMergeZeroCombine(N, DCI);
case AArch64ISD::GLD1_MERGE_ZERO:
case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
@@ -17773,12 +19227,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performVectorShiftCombine(N, *this, DCI);
case AArch64ISD::SUNPKLO:
return performSunpkloCombine(N, DAG);
+ case AArch64ISD::BSP:
+ return performBSPExpandForSVE(
+ N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0)));
case ISD::INSERT_VECTOR_ELT:
return performInsertVectorEltCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
- return performExtractVectorEltCombine(N, DAG);
+ return performExtractVectorEltCombine(N, DCI, Subtarget);
case ISD::VECREDUCE_ADD:
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+ case AArch64ISD::UADDV:
+ return performUADDVCombine(N, DAG);
+ case AArch64ISD::SMULL:
+ case AArch64ISD::UMULL:
+ return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -18152,6 +19614,15 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"Expected fp->int bitcast!");
+
+ // Bitcasting between unpacked vector types of different element counts is
+ // not a NOP because the live elements are laid out differently.
+ // 01234567
+ // e.g. nxv2i32 = XX??XX??
+ // nxv4f16 = X?X?X?X?
+ if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
+ return;
+
SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
return;
@@ -18169,6 +19640,53 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
+static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector() ||
+ (VT.getScalarType().isFloatingPoint() &&
+ !N->getFlags().hasAllowReassociation()) ||
+ (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
+ return;
+
+ SDValue X = N->getOperand(0);
+ auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
+ if (!Shuf) {
+ Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+ X = N->getOperand(1);
+ if (!Shuf)
+ return;
+ }
+
+ if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
+ return;
+
+ // Check the mask is 1,0,3,2,5,4,...
+ ArrayRef<int> Mask = Shuf->getMask();
+ for (int I = 0, E = Mask.size(); I < E; I++)
+ if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
+ return;
+
+ SDLoc DL(N);
+ auto LoHi = DAG.SplitVector(X, DL);
+ assert(LoHi.first.getValueType() == LoHi.second.getValueType());
+ SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
+ LoHi.first, LoHi.second);
+
+ // Shuffle the elements back into order.
+ SmallVector<int> NMask;
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
+ NMask.push_back(I);
+ NMask.push_back(I);
+ }
+ Results.push_back(
+ DAG.getVectorShuffle(VT, DL,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
+ DAG.getUNDEF(LoHi.first.getValueType())),
+ DAG.getUNDEF(VT), NMask));
+}
+
static void ReplaceReductionResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned InterOp,
@@ -18346,6 +19864,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::VECREDUCE_UMIN:
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
+ case ISD::ADD:
+ case ISD::FADD:
+ ReplaceAddWithADDP(N, Results, DAG, Subtarget);
+ return;
case ISD::CTPOP:
if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
@@ -18406,8 +19928,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
ReplaceExtractSubVectorResults(N, Results, DAG);
return;
case ISD::INSERT_SUBVECTOR:
- // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
- // to common code for result type legalisation
+ case ISD::CONCAT_VECTORS:
+ // Custom lowering has been requested for INSERT_SUBVECTOR and
+ // CONCAT_VECTORS -- but delegate to common code for result type
+ // legalisation
return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
@@ -18485,11 +20009,11 @@ bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
if (auto LI = dyn_cast<LoadInst>(I))
return LI->getType()->getPrimitiveSizeInBits() == 128 &&
- LI->getAlignment() >= 16;
+ LI->getAlign() >= Align(16);
if (auto SI = dyn_cast<StoreInst>(I))
return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
- SI->getAlignment() >= 16;
+ SI->getAlign() >= Align(16);
return false;
}
@@ -18502,12 +20026,12 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic(
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
-bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+TargetLoweringBase::AtomicExpansionKind
+AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
- if (Size != 128)
- return false;
-
- return !isOpSuitableForLDPSTP(SI);
+ if (Size != 128 || isOpSuitableForLDPSTP(SI))
+ return AtomicExpansionKind::None;
+ return AtomicExpansionKind::Expand;
}
// Loads and stores less than 128-bits are already atomic; ones above that
@@ -18627,7 +20151,10 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
- Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+ CallInst *CI = Builder.CreateCall(Ldxr, Addr);
+ CI->addParamAttr(
+ 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+ Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
return Builder.CreateBitCast(Trunc, ValueTy);
}
@@ -18668,10 +20195,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
- return Builder.CreateCall(Stxr,
- {Builder.CreateZExtOrBitCast(
- Val, Stxr->getFunctionType()->getParamType(0)),
- Addr});
+ CallInst *CI = Builder.CreateCall(
+ Stxr, {Builder.CreateZExtOrBitCast(
+ Val, Stxr->getFunctionType()->getParamType(0)),
+ Addr});
+ CI->addParamAttr(1, Attribute::get(Builder.getContext(),
+ Attribute::ElementType, Val->getType()));
+ return CI;
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
@@ -18993,8 +20523,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
// For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
// AArch64SVEPredPattern::all, which can enable the use of unpredicated
// variants of instructions when available.
- const auto &Subtarget =
- static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
if (MaxSVESize && MinSVESize == MaxSVESize &&
@@ -19080,22 +20609,23 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
MemVT = MemVT.changeTypeToInteger();
}
- auto NewLoad = DAG.getMaskedLoad(
+ SDValue NewLoad = DAG.getMaskedLoad(
LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
Load->getAddressingMode(), Load->getExtensionType());
+ SDValue Result = NewLoad;
if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
EVT ExtendVT = ContainerVT.changeVectorElementType(
Load->getMemoryVT().getVectorElementType());
- NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
- NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, NewLoad, DAG.getUNDEF(ContainerVT));
+ Result = getSVESafeBitCast(ExtendVT, Result, DAG);
+ Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, Result, DAG.getUNDEF(ContainerVT));
}
- auto Result = convertFromScalableVector(DAG, VT, NewLoad);
- SDValue MergedValues[2] = {Result, Load->getChain()};
+ Result = convertFromScalableVector(DAG, VT, Result);
+ SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
return DAG.getMergeValues(MergedValues, DL);
}
@@ -19143,19 +20673,20 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
IsPassThruZeroOrUndef = true;
}
- auto NewLoad = DAG.getMaskedLoad(
+ SDValue NewLoad = DAG.getMaskedLoad(
ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
Load->getAddressingMode(), Load->getExtensionType());
+ SDValue Result = NewLoad;
if (!IsPassThruZeroOrUndef) {
SDValue OldPassThru =
convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
- NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru);
+ Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
}
- auto Result = convertFromScalableVector(DAG, VT, NewLoad);
- SDValue MergedValues[2] = {Result, Load->getChain()};
+ Result = convertFromScalableVector(DAG, VT, Result);
+ SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
return DAG.getMergeValues(MergedValues, DL);
}
@@ -19232,7 +20763,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
// Scalable vector i32/i64 DIV is supported.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
- return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+ return LowerToPredicatedOp(Op, DAG, PredOpcode);
// Scalable vector i8/i16 DIV is not supported. Promote it to i32.
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -19387,13 +20918,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SelectionDAG &DAG,
- unsigned NewOp,
- bool OverrideNEON) const {
+ unsigned NewOp) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
auto Pg = getPredicateForVector(DAG, DL, VT);
- if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
+ if (VT.isFixedLengthVector()) {
+ assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
// Create list of operands by converting existing ones to scalable types.
@@ -19411,8 +20942,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
continue;
}
- assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
- "Only fixed length vectors are supported!");
+ assert(isTypeLegal(V.getValueType()) &&
+ "Expected only legal fixed-width types");
Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
}
@@ -19543,7 +21074,9 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
SDValue VecOp = ScalarOp.getOperand(0);
EVT SrcVT = VecOp.getValueType();
- if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
+ if (useSVEForFixedLengthVectorVT(
+ SrcVT,
+ /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
}
@@ -19950,6 +21483,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
+ // Safe bitcasting between unpacked vector types of different element counts
+ // is currently unsupported because the following is missing the necessary
+ // work to ensure the result's elements live where they're supposed to within
+ // an SVE register.
+ // 01234567
+ // e.g. nxv2i32 = XX??XX??
+ // nxv4f16 = X?X?X?X?
+ assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
+ VT == PackedVT || InVT == PackedInVT) &&
+ "Unexpected bitcast!");
+
// Pack input if required.
if (InVT != PackedInVT)
Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
@@ -20016,6 +21560,13 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
+bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
+ return Op.getOpcode() == AArch64ISD::DUP ||
+ (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
+ TargetLowering::isTargetCanonicalConstantNode(Op);
+}
+
bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
unsigned Opc, LLT Ty1, LLT Ty2) const {
return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2138c0ffe70a..06ea918ea32e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -55,6 +55,8 @@ enum NodeType : unsigned {
// x29, x29` marker instruction.
CALL_RVMARKER,
+ CALL_BTI, // Function call followed by a BTI instruction.
+
// Produces the full sequence of instructions for getting the thread pointer
// offset of a variable into X0, using the TLSDesc model.
TLSDESC_CALLSEQ,
@@ -79,7 +81,6 @@ enum NodeType : unsigned {
// Predicated instructions where inactive lanes produce undefined results.
ABDS_PRED,
ABDU_PRED,
- ADD_PRED,
FADD_PRED,
FDIV_PRED,
FMA_PRED,
@@ -98,7 +99,6 @@ enum NodeType : unsigned {
SMIN_PRED,
SRA_PRED,
SRL_PRED,
- SUB_PRED,
UDIV_PRED,
UMAX_PRED,
UMIN_PRED,
@@ -158,6 +158,7 @@ enum NodeType : unsigned {
DUPLANE16,
DUPLANE32,
DUPLANE64,
+ DUPLANE128,
// Vector immedate moves
MOVI,
@@ -232,15 +233,10 @@ enum NodeType : unsigned {
SADDV,
UADDV,
- // Vector halving addition
- SHADD,
- UHADD,
-
- // Vector rounding halving addition
- SRHADD,
- URHADD,
-
- // Unsigned Add Long Pairwise
+ // Add Pairwise of two vectors
+ ADDP,
+ // Add Long Pairwise
+ SADDLP,
UADDLP,
// udot/sdot instructions
@@ -411,6 +407,10 @@ enum NodeType : unsigned {
SSTNT1_PRED,
SSTNT1_INDEX_PRED,
+ // SME
+ RDSVL,
+ REVD_MERGE_PASSTHRU,
+
// Asserts that a function argument (i32) is zero-extended to i8 by
// the caller
ASSERT_ZEXT_BOOL,
@@ -462,23 +462,6 @@ enum NodeType : unsigned {
} // end namespace AArch64ISD
-namespace {
-
-// Any instruction that defines a 32-bit result zeros out the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
-// 32 bits, they're probably just qualifying a CopyFromReg.
-static inline bool isDef32(const SDNode &N) {
- unsigned Opc = N.getOpcode();
- return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
- Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
- Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
- Opc != ISD::FREEZE;
-}
-
-} // end anonymous namespace
-
namespace AArch64 {
/// Possible values of current rounding mode, which is specified in bits
/// 23:22 of FPCR.
@@ -501,6 +484,11 @@ public:
explicit AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI);
+ /// Control the following reassociation of operands: (op (op x, c1), y) -> (op
+ /// (op x, y), c1) where N0 is (op x, c1) and N1 is y.
+ bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const override;
+
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
@@ -573,6 +561,17 @@ public:
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -610,8 +609,8 @@ public:
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
- bool isMulAddWithConstProfitable(const SDValue &AddNode,
- const SDValue &ConstNode) const override;
+ bool isMulAddWithConstProfitable(SDValue AddNode,
+ SDValue ConstNode) const override;
bool shouldConsiderGEPOffsetSplit() const override;
@@ -651,6 +650,10 @@ public:
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ /// Return true if it is profitable to fold a pair of shifts into a mask.
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -680,7 +683,8 @@ public:
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
- bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
@@ -898,11 +902,8 @@ private:
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- bool isEligibleForTailCallOptimization(
- SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+ bool
+ isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const;
/// Finds the incoming stack arguments which overlap the given fixed stack
/// object and incorporates their load into the current chain. This prevents
@@ -980,8 +981,8 @@ private:
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
- bool OverrideNEON = false) const;
+ SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOp) const;
SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -1052,6 +1053,8 @@ private:
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
+ SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps, bool &UseOneConst,
bool Reciprocal) const override;
@@ -1093,7 +1096,7 @@ private:
}
bool shouldExtendGSIndex(EVT VT, EVT &EltTy) const override;
- bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+ bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
@@ -1129,6 +1132,8 @@ private:
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ bool isTargetCanonicalConstantNode(SDValue Op) const override;
+
// Normally SVE is only used for byte size vectors that do not fit within a
// NEON vector. This changes when OverrideNEON is true, allowing SVE to be
// used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index b220929514f9..c477a44b13b2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -27,22 +27,43 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>;
// supported, but when they're relaxed and anything can be used, all the
// standard modes would be valid and may give efficiency gains.
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr)> {
+ let IsAtomic = 1;
+ let IsAtomicOrderingAcquireOrStronger = 0;
+}
+
// A atomic load operation that actually needs acquire semantics.
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr)> {
let IsAtomic = 1;
- let IsAtomicOrderingAcquireOrStronger = 1;
+ let IsAtomicOrderingAcquire = 1;
}
-// An atomic load operation that does not need either acquire or release
-// semantics.
-class relaxed_load<PatFrag base>
+// An atomic load operation that needs sequential consistency.
+class seq_cst_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr)> {
let IsAtomic = 1;
- let IsAtomicOrderingAcquireOrStronger = 0;
+ let IsAtomicOrderingSequentiallyConsistent = 1;
+}
+
+// RCPC extension, currently opt-in under a separate feature.
+let Predicates = [HasLDAPR] in {
+ // v8.3 Release Consistent Processor Consistent support, optional in v8.2.
+ // 8-bit loads
+ def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDAPRB GPR64sp:$ptr)>;
+ // 16-bit loads
+ def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDAPRH GPR64sp:$ptr)>;
+ // 32-bit loads
+ def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDAPRW GPR64sp:$ptr)>;
+ // 64-bit loads
+ def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDAPRX GPR64sp:$ptr)>;
}
// 8-bit loads
+def : Pat<(seq_cst_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend8:$offset)),
@@ -58,6 +79,7 @@ def : Pat<(relaxed_load<atomic_load_8>
(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
// 16-bit loads
+def : Pat<(seq_cst_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend16:$extend)),
@@ -73,6 +95,7 @@ def : Pat<(relaxed_load<atomic_load_16>
(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
// 32-bit loads
+def : Pat<(seq_cst_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend32:$extend)),
@@ -88,6 +111,7 @@ def : Pat<(relaxed_load<atomic_load_32>
(LDURWi GPR64sp:$Rn, simm9:$offset)>;
// 64-bit loads
+def : Pat<(seq_cst_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend)),
@@ -490,7 +514,8 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
mayLoad = 1, mayStore = 1 in {
-class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch),
+class cmp_swap_128 : Pseudo<(outs GPR64common:$RdLo, GPR64common:$RdHi,
+ GPR32common:$scratch),
(ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
GPR64:$newLo, GPR64:$newHi), []>,
Sched<[WriteAtomic]>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 4c1e41b7efee..78bc1b8c6f02 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -109,15 +109,19 @@ class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
-// Helper fragment for an extract of the high portion of a 128-bit vector.
+// Helper fragment for an extract of the high portion of a 128-bit vector. The
+// ComplexPattern match both extract_subvector and bitcast(extract_subvector(..)).
def extract_high_v16i8 :
- UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+ ComplexPattern<v8i8, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
def extract_high_v8i16 :
- UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+ ComplexPattern<v4i16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
def extract_high_v4i32 :
- UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
-def extract_high_v2i64 :
- UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+ ComplexPattern<v2i32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+
+def extract_high_dup_v8i16 :
+ BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>;
+def extract_high_dup_v4i32 :
+ BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>;
//===----------------------------------------------------------------------===//
// Asm Operand Classes.
@@ -1178,6 +1182,13 @@ def fpimm32XForm : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>;
+def fpimm32SIMDModImmType4XForm : SDNodeXForm<fpimm, [{
+ uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType4(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>;
+
def fpimm64XForm : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP64Imm(InVal);
@@ -1199,6 +1210,13 @@ def fpimm32 : Operand<f32>,
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
}
+
+def fpimm32SIMDModImmType4 : FPImmLeaf<f32, [{
+ uint64_t Enc = Imm.bitcastToAPInt().getZExtValue();
+ return Enc != 0 && AArch64_AM::isAdvSIMDModImmType4(Enc << 32 | Enc);
+ }], fpimm32SIMDModImmType4XForm> {
+}
+
def fpimm64 : Operand<f64>,
FPImmLeaf<f64, [{
return AArch64_AM::getFP64Imm(Imm) != -1;
@@ -1234,6 +1252,9 @@ def gi_fpimm32 : GICustomOperandRenderer<"renderFPImm32">,
GISDNodeXFormEquiv<fpimm32XForm>;
def gi_fpimm64 : GICustomOperandRenderer<"renderFPImm64">,
GISDNodeXFormEquiv<fpimm64XForm>;
+def gi_fpimm32SIMDModImmType4 :
+ GICustomOperandRenderer<"renderFPImm32SIMDModImmType4">,
+ GISDNodeXFormEquiv<fpimm32SIMDModImmType4XForm>;
// Vector lane operands
class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
@@ -1261,8 +1282,12 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
-defm VectorIndex0 : VectorIndex<i64, VectorIndex0Operand,
+let OperandNamespace = "AArch64" in {
+ let OperandType = "OPERAND_IMPLICIT_IMM_0" in {
+ defm VectorIndex0 : VectorIndex<i64, VectorIndex0Operand,
[{ return ((uint64_t)Imm) == 0; }]>;
+ }
+}
defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
@@ -1312,6 +1337,8 @@ def sme_elm_idx0_0 : Operand<i64>, ImmLeaf<i64, [{
}]> {
let ParserMatchClass = Imm0_0Operand;
let PrintMethod = "printMatrixIndex";
+ let OperandNamespace = "AArch64";
+ let OperandType = "OPERAND_IMPLICIT_IMM_0";
}
def sme_elm_idx0_1 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) <= 1;
@@ -4512,8 +4539,9 @@ multiclass MemTagStore<bits<2> opc1, string insn> {
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
- : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", []>,
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm,
+ list<dag> pattern = []>
+ : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", pattern>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
@@ -4542,6 +4570,7 @@ let Predicates = [HasFPARMv8] in {
// Floating point to integer conversion
//---
+let mayRaiseFPException = 1 in
class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
string asm, list<dag> pattern>
@@ -4561,7 +4590,7 @@ class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
let Inst{4-0} = Rd;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
@@ -4683,7 +4712,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
// Integer to floating point conversion
//---
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseIntegerToFP<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
@@ -4701,6 +4730,7 @@ class BaseIntegerToFP<bit isUnsigned,
let Inst{4-0} = Rd;
}
+let mayRaiseFPException = 1 in
class BaseIntegerToFPUnscaled<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
ValueType dvt, string asm, SDPatternOperator node>
@@ -4937,6 +4967,7 @@ multiclass UnscaledConversion<string asm> {
// Floating point conversion
//---
+let mayRaiseFPException = 1 in
class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
RegisterClass srcType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
@@ -4963,15 +4994,15 @@ multiclass FPConversion<string asm> {
// Half-precision to Double-precision
def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
- [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
+ [(set FPR64:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>;
// Half-precision to Single-precision
def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
- [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
+ [(set FPR32:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>;
// Single-precision to Double-precision
def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
- [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
+ [(set FPR64:$Rd, (any_fpextend FPR32:$Rn))]>;
// Single-precision to Half-precision
def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
@@ -4999,8 +5030,9 @@ class BaseSingleOperandFPData<bits<6> opcode, RegisterClass regtype,
}
multiclass SingleOperandFPData<bits<4> opcode, string asm,
- SDPatternOperator node = null_frag> {
-
+ SDPatternOperator node = null_frag,
+ int fpexceptions = 1> {
+ let mayRaiseFPException = fpexceptions in {
def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
@@ -5013,8 +5045,14 @@ multiclass SingleOperandFPData<bits<4> opcode, string asm,
def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> {
let Inst{23-22} = 0b01; // 64-bit size flag
}
+ }
}
+multiclass SingleOperandFPDataNoException<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag>
+ : SingleOperandFPData<opcode, asm, node, 0>;
+
+let mayRaiseFPException = 1 in
multiclass SingleOperandFPNo16<bits<6> opcode, string asm,
SDPatternOperator node = null_frag>{
@@ -5035,7 +5073,7 @@ multiclass FRIntNNT<bits<2> opcode, string asm, SDPatternOperator node = null_fr
// Two operand floating point data processing
//---
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
@@ -5075,7 +5113,8 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm,
}
}
-multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm,
+ SDPatternOperator node> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
[(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
@@ -5098,6 +5137,7 @@ multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
// Three operand floating point data processing
//---
+let mayRaiseFPException = 1 in
class BaseThreeOperandFPData<bit isNegated, bit isSub,
RegisterClass regtype, string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
@@ -5142,7 +5182,7 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
// Floating point data comparisons
//---
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseOneOperandFPComparison<bit signalAllNans,
RegisterClass regtype, string asm,
list<dag> pat>
@@ -5161,7 +5201,7 @@ class BaseOneOperandFPComparison<bit signalAllNans,
let PostEncoderMethod = "fixOneOperandFPComparison";
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
string asm, list<dag> pat>
: I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
@@ -5218,7 +5258,7 @@ multiclass FPComparison<bit signalAllNans, string asm,
// Floating point conditional comparisons
//---
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
string mnemonic, list<dag> pat>
: I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
@@ -5544,6 +5584,7 @@ multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
}
// As above, but only floating point elements supported.
+let mayRaiseFPException = 1 in
multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -5565,6 +5606,7 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
string asm,
SDPatternOperator OpNode> {
@@ -5587,6 +5629,7 @@ multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -5614,6 +5657,7 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
}
// As above, but D and B sized elements unsupported.
+let mayRaiseFPException = 1 in
multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
@@ -5718,6 +5762,7 @@ multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperato
// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from
// 8H to 4S, when Q=1).
+let mayRaiseFPException = 1 in
class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
@@ -5986,7 +6031,9 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
// Supports H, S and D element sizes, uses high bit of the size field
// as an extra opcode bit.
multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode,
+ int fpexceptions = 1> {
+ let mayRaiseFPException = fpexceptions in {
let Predicates = [HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
asm, ".4h", ".4h",
@@ -6004,9 +6051,15 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+ }
}
+multiclass SIMDTwoVectorFPNoException<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode>
+ : SIMDTwoVectorFP<U, S, opc, asm, OpNode, 0>;
+
// Supports only S and D element sizes
+let mayRaiseFPException = 1 in
multiclass SIMDTwoVectorSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
@@ -6036,7 +6089,7 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
-
+let mayRaiseFPException = 1 in
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -6058,6 +6111,7 @@ multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -6209,6 +6263,7 @@ multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
+ let mayRaiseFPException = 1 in {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
asm, ".4h", "0.0",
@@ -6226,6 +6281,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
asm, ".2d", "0.0",
v2i64, v2f64, OpNode>;
+ }
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
@@ -6253,7 +6309,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
@@ -6275,7 +6331,7 @@ class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
@@ -6457,8 +6513,8 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
asm#"2", ".1q", ".2d", ".2d", []>;
}
- def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
- (v8i8 (extract_high_v16i8 V128:$Rm)))),
+ def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+ (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
(!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
}
@@ -6471,8 +6527,8 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
- [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))]>;
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6480,8 +6536,8 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
- [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))]>;
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
@@ -6495,8 +6551,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm)))))]>;
+ (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)),
+ (extract_high_v16i8 (v16i8 V128:$Rm))))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
@@ -6506,8 +6562,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))))]>;
+ (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm))))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6517,8 +6573,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))))]>;
+ (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm))))))]>;
}
multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
@@ -6535,8 +6591,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(add (v8i16 V128:$Rd),
- (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm))))))]>;
+ (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)),
+ (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
@@ -6548,8 +6604,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(add (v4i32 V128:$Rd),
- (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm))))))]>;
+ (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6561,8 +6617,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(add (v2i64 V128:$Rd),
- (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm))))))]>;
+ (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>;
}
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
@@ -6574,8 +6630,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
- [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm)))]>;
+ [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)),
+ (extract_high_v16i8 (v16i8 V128:$Rm))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
@@ -6583,8 +6639,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
- [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))]>;
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6592,8 +6648,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
- [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))]>;
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
@@ -6609,8 +6665,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd),
- (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm)))]>;
+ (extract_high_v16i8 (v16i8 V128:$Rn)),
+ (extract_high_v16i8 (v16i8 V128:$Rm))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
@@ -6621,8 +6677,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
- (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))]>;
+ (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6633,8 +6689,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
- (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))]>;
+ (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
@@ -6651,8 +6707,8 @@ multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))))]>;
+ (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_v8i16 (v8i16 V128:$Rm))))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
@@ -6665,8 +6721,8 @@ multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
- (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))))]>;
+ (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_v4i32 (v4i32 V128:$Rm))))))]>;
}
multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
@@ -6679,7 +6735,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".8h", ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
- (extract_high_v16i8 V128:$Rm)))]>;
+ (extract_high_v16i8 (v16i8 V128:$Rm))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V128, V64,
asm, ".4s", ".4s", ".4h",
@@ -6688,7 +6744,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".4s", ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
- (extract_high_v8i16 V128:$Rm)))]>;
+ (extract_high_v8i16 (v8i16 V128:$Rm))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V128, V64,
asm, ".2d", ".2d", ".2s",
@@ -6697,7 +6753,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
V128, V128, V128,
asm#"2", ".2d", ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
- (extract_high_v4i32 V128:$Rm)))]>;
+ (extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
//----------------------------------------------------------------------------
@@ -6876,7 +6932,7 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm> {
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag,
Predicate pred = HasNEON> {
- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in {
let Predicates = [pred] in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
@@ -6895,7 +6951,7 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
@@ -7025,6 +7081,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
let Inst{4-0} = Rd;
}
+let mayRaiseFPException = 1 in
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
[(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
@@ -7048,11 +7105,13 @@ multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
+ let mayRaiseFPException = 1 in {
def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
}
+ }
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
@@ -7076,6 +7135,7 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
Predicate pred = HasNEON> {
let Predicates = [pred] in {
@@ -7087,6 +7147,7 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
}
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
@@ -7169,6 +7230,7 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
asm, ".2d">;
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
@@ -7232,6 +7294,7 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
asm, ".4s", []>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
Intrinsic intOp> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -7351,7 +7414,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
multiclass SMov {
// SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorStreamingSVE] in {
+ let Predicates = [HasNEONorSME] in {
def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -7398,7 +7461,7 @@ multiclass SMov {
multiclass UMov {
// UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorStreamingSVE] in {
+ let Predicates = [HasNEONorSME] in {
def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -8048,6 +8111,7 @@ multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
".2h", V128, v4f32, v8bf16>;
}
+let mayRaiseFPException = 1 in
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
@@ -8056,6 +8120,7 @@ class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
}
+let mayRaiseFPException = 1 in
class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
: I<(outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
@@ -8095,18 +8160,21 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
", $Rm", ".8h", "}");
}
+let mayRaiseFPException = 1 in
class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+let mayRaiseFPException = 1 in
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
"bfcvtn2", ".8h", ".4s",
[(set (v8bf16 V128:$dst),
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+let mayRaiseFPException = 1 in
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
@@ -8160,6 +8228,7 @@ multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string as
}
// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
+let mayRaiseFPException = 1 in
class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm,
string dst_kind, string lhs_kind,
string rhs_kind, RegisterOperand RegType,
@@ -8187,6 +8256,7 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
V128, v4f32, v8f16, OpNode>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
@@ -8369,6 +8439,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
V128:$Rm, VectorIndexD:$idx)>;
}
+let mayRaiseFPException = 1 in
multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
@@ -8701,9 +8772,8 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
- (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))]> {
+ (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -8728,9 +8798,8 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
- (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))]> {
+ (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
@@ -8793,10 +8862,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull
- (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16
- (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))))]> {
+ (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
@@ -8825,10 +8892,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull
- (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32
- (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
@@ -8881,9 +8946,8 @@ multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
- (OpNode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))]> {
+ (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -8908,9 +8972,8 @@ multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
- (OpNode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))]> {
+ (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
@@ -8940,9 +9003,8 @@ multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
- (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))]> {
+ (extract_high_v8i16 (v8i16 V128:$Rn)),
+ (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
@@ -8967,9 +9029,8 @@ multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
- (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))]> {
+ (extract_high_v4i32 (v4i32 V128:$Rn)),
+ (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
@@ -9654,7 +9715,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
V128, V128, vecshiftL8,
asm#"2", ".8h", ".16b",
[(set (v8i16 V128:$Rd),
- (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+ (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), vecshiftL8:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
}
@@ -9670,7 +9731,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
V128, V128, vecshiftL16,
asm#"2", ".4s", ".8h",
[(set (v4i32 V128:$Rd),
- (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+ (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), vecshiftL16:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
@@ -9687,7 +9748,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
V128, V128, vecshiftL32,
asm#"2", ".2d", ".4s",
[(set (v2i64 V128:$Rd),
- (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+ (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), vecshiftL32:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
}
@@ -10671,7 +10732,7 @@ def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
let PrintMethod = "printComplexRotationOp<180, 90>";
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
RegisterOperand regtype, Operand rottype,
string asm, string kind, list<dag> pattern>
@@ -10742,7 +10803,7 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
}
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
bits<3> opcode,
RegisterOperand regtype,
@@ -10814,7 +10875,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
}
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in
class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
bit opc1, bit opc2, RegisterOperand dst_reg,
RegisterOperand lhs_reg,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index a9191924129c..835a7b6cc81d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -42,6 +42,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -1094,7 +1095,10 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
return true;
default:;
}
- return isSEHInstruction(MI);
+ if (isSEHInstruction(MI))
+ return true;
+ auto Next = std::next(MI.getIterator());
+ return Next != MBB->end() && Next->isCFIInstruction();
}
/// analyzeCompare - For a comparison instruction, return the source registers
@@ -1435,7 +1439,7 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return false;
const MCInstrDesc &MCID = get(NewOpc);
CmpInstr.setDesc(MCID);
- CmpInstr.RemoveOperand(DeadNZCVIdx);
+ CmpInstr.removeOperand(DeadNZCVIdx);
bool succeeded = UpdateOperandRegClass(CmpInstr);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
@@ -1547,27 +1551,6 @@ findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
}
}
-namespace {
-
-struct UsedNZCV {
- bool N = false;
- bool Z = false;
- bool C = false;
- bool V = false;
-
- UsedNZCV() = default;
-
- UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
- this->N |= UsedFlags.N;
- this->Z |= UsedFlags.Z;
- this->C |= UsedFlags.C;
- this->V |= UsedFlags.V;
- return *this;
- }
-};
-
-} // end anonymous namespace
-
/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
@@ -1622,15 +1605,15 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
return UsedFlags;
}
-/// \returns Conditions flags used after \p CmpInstr in its MachineBB if they
-/// are not containing C or V flags and NZCV flags are not alive in successors
-/// of the same \p CmpInstr and \p MI parent. \returns None otherwise.
+/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
+/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
+/// \returns None otherwise.
///
/// Collect instructions using that flags in \p CCUseInstrs if provided.
-static Optional<UsedNZCV>
-examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
- const TargetRegisterInfo &TRI,
- SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
+Optional<UsedNZCV>
+llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
+ const TargetRegisterInfo &TRI,
+ SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
MachineBasicBlock *CmpParent = CmpInstr.getParent();
if (MI.getParent() != CmpParent)
return None;
@@ -1652,8 +1635,6 @@ examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
break;
}
- if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
- return None;
return NZCVUsedAfterCmp;
}
@@ -1684,7 +1665,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
return false;
- if (!examineCFlagsUse(MI, CmpInstr, TRI))
+ Optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
+ if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V)
return false;
AccessKind AccessToCheck = AK_Write;
@@ -1773,7 +1755,7 @@ static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
// Condition flags are not used in CmpInstr basic block successors and only
// Z or N flags allowed to be used after CmpInstr within its basic block
- if (!NZCVUsedAfterCmp)
+ if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
return false;
// Z or N flag used after CmpInstr must correspond to the flag used in MI
if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
@@ -2270,6 +2252,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::LD1SW_D_IMM:
case AArch64::LD1D_IMM:
+ case AArch64::LD2B_IMM:
+ case AArch64::LD2H_IMM:
+ case AArch64::LD2W_IMM:
+ case AArch64::LD2D_IMM:
+ case AArch64::LD3B_IMM:
+ case AArch64::LD3H_IMM:
+ case AArch64::LD3W_IMM:
+ case AArch64::LD3D_IMM:
+ case AArch64::LD4B_IMM:
+ case AArch64::LD4H_IMM:
+ case AArch64::LD4W_IMM:
+ case AArch64::LD4D_IMM:
+
case AArch64::ST1B_IMM:
case AArch64::ST1B_H_IMM:
case AArch64::ST1B_S_IMM:
@@ -2281,6 +2276,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::ST1W_D_IMM:
case AArch64::ST1D_IMM:
+ case AArch64::ST2B_IMM:
+ case AArch64::ST2H_IMM:
+ case AArch64::ST2W_IMM:
+ case AArch64::ST2D_IMM:
+ case AArch64::ST3B_IMM:
+ case AArch64::ST3H_IMM:
+ case AArch64::ST3W_IMM:
+ case AArch64::ST3D_IMM:
+ case AArch64::ST4B_IMM:
+ case AArch64::ST4H_IMM:
+ case AArch64::ST4W_IMM:
+ case AArch64::ST4D_IMM:
+
case AArch64::LD1RB_IMM:
case AArch64::LD1RB_H_IMM:
case AArch64::LD1RB_S_IMM:
@@ -2897,6 +2905,45 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MinOffset = -8;
MaxOffset = 7;
break;
+ case AArch64::LD2B_IMM:
+ case AArch64::LD2H_IMM:
+ case AArch64::LD2W_IMM:
+ case AArch64::LD2D_IMM:
+ case AArch64::ST2B_IMM:
+ case AArch64::ST2H_IMM:
+ case AArch64::ST2W_IMM:
+ case AArch64::ST2D_IMM:
+ Scale = TypeSize::Scalable(32);
+ Width = SVEMaxBytesPerVector * 2;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD3B_IMM:
+ case AArch64::LD3H_IMM:
+ case AArch64::LD3W_IMM:
+ case AArch64::LD3D_IMM:
+ case AArch64::ST3B_IMM:
+ case AArch64::ST3H_IMM:
+ case AArch64::ST3W_IMM:
+ case AArch64::ST3D_IMM:
+ Scale = TypeSize::Scalable(48);
+ Width = SVEMaxBytesPerVector * 3;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD4B_IMM:
+ case AArch64::LD4H_IMM:
+ case AArch64::LD4W_IMM:
+ case AArch64::LD4D_IMM:
+ case AArch64::ST4B_IMM:
+ case AArch64::ST4H_IMM:
+ case AArch64::ST4W_IMM:
+ case AArch64::ST4D_IMM:
+ Scale = TypeSize::Scalable(64);
+ Width = SVEMaxBytesPerVector * 4;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
case AArch64::LD1B_H_IMM:
case AArch64::LD1SB_H_IMM:
case AArch64::LD1H_S_IMM:
@@ -3105,6 +3152,86 @@ bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
return isPreLd(MI) || isPreSt(MI);
}
+bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi:
+ case AArch64::STGPi:
+ return true;
+ }
+}
+
+const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
+ unsigned Idx =
+ AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
+ : 1;
+ return MI.getOperand(Idx);
+}
+
+const MachineOperand &
+AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
+ unsigned Idx =
+ AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
+ : 2;
+ return MI.getOperand(Idx);
+}
+
+static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
+ Register Reg) {
+ if (MI.getParent() == nullptr)
+ return nullptr;
+ const MachineFunction *MF = MI.getParent()->getParent();
+ return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
+}
+
+bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
+ auto IsQFPR = [&](const MachineOperand &Op) {
+ if (!Op.isReg())
+ return false;
+ auto Reg = Op.getReg();
+ if (Reg.isPhysical())
+ return AArch64::FPR128RegClass.contains(Reg);
+ const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
+ return TRC == &AArch64::FPR128RegClass ||
+ TRC == &AArch64::FPR128_loRegClass;
+ };
+ return llvm::any_of(MI.operands(), IsQFPR);
+}
+
+bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
+ auto IsFPR = [&](const MachineOperand &Op) {
+ if (!Op.isReg())
+ return false;
+ auto Reg = Op.getReg();
+ if (Reg.isPhysical())
+ return AArch64::FPR128RegClass.contains(Reg) ||
+ AArch64::FPR64RegClass.contains(Reg) ||
+ AArch64::FPR32RegClass.contains(Reg) ||
+ AArch64::FPR16RegClass.contains(Reg) ||
+ AArch64::FPR8RegClass.contains(Reg);
+
+ const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
+ return TRC == &AArch64::FPR128RegClass ||
+ TRC == &AArch64::FPR128_loRegClass ||
+ TRC == &AArch64::FPR64RegClass ||
+ TRC == &AArch64::FPR64_loRegClass ||
+ TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
+ TRC == &AArch64::FPR8RegClass;
+ };
+ return llvm::any_of(MI.operands(), IsFPR);
+}
+
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
@@ -3370,7 +3497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Predicate register by ORRing with itself.
if (AArch64::PPRRegClass.contains(DestReg) &&
AArch64::PPRRegClass.contains(SrcReg)) {
- assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ assert((Subtarget.hasSVE() || Subtarget.hasSME()) &&
+ "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
.addReg(SrcReg) // Pg
.addReg(SrcReg)
@@ -3381,7 +3509,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register by ORRing with itself.
if (AArch64::ZPRRegClass.contains(DestReg) &&
AArch64::ZPRRegClass.contains(SrcReg)) {
- assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ assert((Subtarget.hasSVE() || Subtarget.hasSME()) &&
+ "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -3391,6 +3520,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register pair by copying the individual sub-registers.
if (AArch64::ZPR2RegClass.contains(DestReg) &&
AArch64::ZPR2RegClass.contains(SrcReg)) {
+ assert((Subtarget.hasSVE() || Subtarget.hasSME()) &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
Indices);
@@ -3400,6 +3531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register triple by copying the individual sub-registers.
if (AArch64::ZPR3RegClass.contains(DestReg) &&
AArch64::ZPR3RegClass.contains(SrcReg)) {
+ assert((Subtarget.hasSVE() || Subtarget.hasSME()) &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
@@ -3410,6 +3543,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register quad by copying the individual sub-registers.
if (AArch64::ZPR4RegClass.contains(DestReg) &&
AArch64::ZPR4RegClass.contains(SrcReg)) {
+ assert((Subtarget.hasSVE() || Subtarget.hasSME()) &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
@@ -3979,6 +4114,119 @@ void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
}
}
+// Convenience function to create a DWARF expression for
+// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
+static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
+ int NumVGScaledBytes, unsigned VG,
+ llvm::raw_string_ostream &Comment) {
+ uint8_t buffer[16];
+
+ if (NumBytes) {
+ Expr.push_back(dwarf::DW_OP_consts);
+ Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
+ Expr.push_back((uint8_t)dwarf::DW_OP_plus);
+ Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
+ }
+
+ if (NumVGScaledBytes) {
+ Expr.push_back((uint8_t)dwarf::DW_OP_consts);
+ Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
+
+ Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
+ Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
+ Expr.push_back(0);
+
+ Expr.push_back((uint8_t)dwarf::DW_OP_mul);
+ Expr.push_back((uint8_t)dwarf::DW_OP_plus);
+
+ Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
+ << std::abs(NumVGScaledBytes) << " * VG";
+ }
+}
+
+// Creates an MCCFIInstruction:
+// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
+static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
+ unsigned Reg,
+ const StackOffset &Offset) {
+ int64_t NumBytes, NumVGScaledBytes;
+ AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
+ NumVGScaledBytes);
+ std::string CommentBuffer;
+ llvm::raw_string_ostream Comment(CommentBuffer);
+
+ if (Reg == AArch64::SP)
+ Comment << "sp";
+ else if (Reg == AArch64::FP)
+ Comment << "fp";
+ else
+ Comment << printReg(Reg, &TRI);
+
+ // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
+ SmallString<64> Expr;
+ unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
+ Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
+ Expr.push_back(0);
+ appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
+ TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+
+ // Wrap this into DW_CFA_def_cfa.
+ SmallString<64> DefCfaExpr;
+ DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
+ uint8_t buffer[16];
+ DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
+ DefCfaExpr.append(Expr.str());
+ return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
+ Comment.str());
+}
+
+MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
+ unsigned FrameReg, unsigned Reg,
+ const StackOffset &Offset,
+ bool LastAdjustmentWasScalable) {
+ if (Offset.getScalable())
+ return createDefCFAExpression(TRI, Reg, Offset);
+
+ if (FrameReg == Reg && !LastAdjustmentWasScalable)
+ return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
+
+ unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
+ return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
+}
+
+MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
+ unsigned Reg,
+ const StackOffset &OffsetFromDefCFA) {
+ int64_t NumBytes, NumVGScaledBytes;
+ AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
+ OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
+
+ unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
+
+ // Non-scalable offsets can use DW_CFA_offset directly.
+ if (!NumVGScaledBytes)
+ return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
+
+ std::string CommentBuffer;
+ llvm::raw_string_ostream Comment(CommentBuffer);
+ Comment << printReg(Reg, &TRI) << " @ cfa";
+
+ // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
+ SmallString<64> OffsetExpr;
+ appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
+ TRI.getDwarfRegNum(AArch64::VG, true), Comment);
+
+ // Wrap this into DW_CFA_expression
+ SmallString<64> CfaExpr;
+ CfaExpr.push_back(dwarf::DW_CFA_expression);
+ uint8_t buffer[16];
+ CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
+ CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
+ CfaExpr.append(OffsetExpr.str());
+
+ return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
+}
+
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
@@ -3988,7 +4236,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
unsigned SrcReg, int64_t Offset, unsigned Opc,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool NeedsWinCFI,
- bool *HasWinCFI) {
+ bool *HasWinCFI, bool EmitCFAOffset,
+ StackOffset CFAOffset, unsigned FrameReg) {
int Sign = 1;
unsigned MaxEncoding, ShiftSize;
switch (Opc) {
@@ -4013,6 +4262,13 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
llvm_unreachable("Unsupported opcode");
}
+ // `Offset` can be in bytes or in "scalable bytes".
+ int VScale = 1;
+ if (Opc == AArch64::ADDVL_XXI)
+ VScale = 16;
+ else if (Opc == AArch64::ADDPL_XXI)
+ VScale = 2;
+
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
// scratch register; otherwise, create a new virtual register (to be
@@ -4050,6 +4306,26 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
MBI = MBI.setMIFlag(Flag);
+ auto Change =
+ VScale == 1
+ ? StackOffset::getFixed(ThisVal << LocalShiftSize)
+ : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
+ if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
+ CFAOffset += Change;
+ else
+ CFAOffset -= Change;
+ if (EmitCFAOffset && DestReg == TmpReg) {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+ unsigned CFIIndex = MF.addFrameInst(
+ createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(Flag);
+ }
+
if (NeedsWinCFI) {
assert(Sign == 1 && "SEH directives should always have a positive sign");
int Imm = (int)(ThisVal << LocalShiftSize);
@@ -4086,7 +4362,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
StackOffset Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV,
- bool NeedsWinCFI, bool *HasWinCFI) {
+ bool NeedsWinCFI, bool *HasWinCFI,
+ bool EmitCFAOffset, StackOffset CFAOffset,
+ unsigned FrameReg) {
int64_t Bytes, NumPredicateVectors, NumDataVectors;
AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
Offset, Bytes, NumPredicateVectors, NumDataVectors);
@@ -4101,8 +4379,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
}
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
- NeedsWinCFI, HasWinCFI);
+ NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
+ FrameReg);
+ CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
+ ? StackOffset::getFixed(-Bytes)
+ : StackOffset::getFixed(Bytes);
SrcReg = DestReg;
+ FrameReg = DestReg;
}
assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
@@ -4112,14 +4395,17 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
if (NumDataVectors) {
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
- AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+ AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr,
+ EmitCFAOffset, CFAOffset, FrameReg);
+ CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
SrcReg = DestReg;
}
if (NumPredicateVectors) {
assert(DestReg != AArch64::SP && "Unaligned access to SP");
emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
- AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+ AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr,
+ EmitCFAOffset, CFAOffset, FrameReg);
}
}
@@ -4151,6 +4437,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
}
+ // Nothing can folded with copy from/to NZCV.
+ if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
+ return nullptr;
}
// Handle the case where a copy is being spilled or filled but the source
@@ -4577,6 +4866,10 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
return false;
}
+ if (isCombineInstrSettingFlag(CombineOpc) &&
+ MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return false;
+
return true;
}
@@ -4919,6 +5212,10 @@ static bool getFMULPatterns(MachineInstr &Root,
MachineInstr *MI = nullptr;
if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
+ // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
+ if (MI && MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getReg().isVirtual())
+ MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
if (MI && MI->getOpcode() == Opcode) {
Patterns.push_back(Pattern);
return true;
@@ -5073,6 +5370,42 @@ bool AArch64InstrInfo::isThroughputPattern(
} // end switch (Pattern)
return false;
}
+
+/// Find other MI combine patterns.
+static bool getMiscPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns)
+{
+ // A - (B + C) ==> (A - B) - C or (A - C) - B
+ unsigned Opc = Root.getOpcode();
+ MachineBasicBlock &MBB = *Root.getParent();
+
+ switch (Opc) {
+ case AArch64::SUBWrr:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBXrr:
+ case AArch64::SUBSXrr:
+ // Found candidate root.
+ break;
+ default:
+ return false;
+ }
+
+ if (isCombineInstrSettingFlag(Opc) &&
+ Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+ return false;
+
+ if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
+ canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
+ canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
+ canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
+ Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
+ Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
+ return true;
+ }
+
+ return false;
+}
+
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -5090,6 +5423,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getFMAPatterns(Root, Patterns))
return true;
+ // Other patterns
+ if (getMiscPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -5190,6 +5527,9 @@ genIndexedMultiply(MachineInstr &Root,
MachineInstr *Dup =
MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
+ if (Dup->getOpcode() == TargetOpcode::COPY)
+ Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
+
Register DupSrcReg = Dup->getOperand(1).getReg();
MRI.clearKillFlags(DupSrcReg);
MRI.constrainRegClass(DupSrcReg, RC);
@@ -5337,6 +5677,53 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
return MUL;
}
+/// Do the following transformation
+/// A - (B + C) ==> (A - B) - C
+/// A - (B + C) ==> (A - C) - B
+static void
+genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ unsigned IdxOpd1,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+ assert(IdxOpd1 == 1 || IdxOpd1 == 2);
+ unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
+ MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+
+ Register ResultReg = Root.getOperand(0).getReg();
+ Register RegA = Root.getOperand(1).getReg();
+ bool RegAIsKill = Root.getOperand(1).isKill();
+ Register RegB = AddMI->getOperand(IdxOpd1).getReg();
+ bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
+ Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
+ bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
+ Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
+
+ unsigned Opcode = Root.getOpcode();
+ if (Opcode == AArch64::SUBSWrr)
+ Opcode = AArch64::SUBWrr;
+ else if (Opcode == AArch64::SUBSXrr)
+ Opcode = AArch64::SUBXrr;
+ else
+ assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
+ "Unexpected instruction opcode.");
+
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR)
+ .addReg(RegA, getKillRegState(RegAIsKill))
+ .addReg(RegB, getKillRegState(RegBIsKill));
+ MachineInstrBuilder MIB2 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), ResultReg)
+ .addReg(NewVR, getKillRegState(true))
+ .addReg(RegC, getKillRegState(RegCIsKill));
+
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ InsInstrs.push_back(MIB1);
+ InsInstrs.push_back(MIB2);
+ DelInstrs.push_back(AddMI);
+}
+
/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
@@ -5359,6 +5746,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
DelInstrs, InstrIdxForVirtReg);
return;
+ case MachineCombinerPattern::SUBADD_OP1:
+ // A - (B + C)
+ // ==> (A - B) - C
+ genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
+ InstrIdxForVirtReg);
+ break;
+ case MachineCombinerPattern::SUBADD_OP2:
+ // A - (B + C)
+ // ==> (A - C) - B
+ genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
+ InstrIdxForVirtReg);
+ break;
case MachineCombinerPattern::MULADDW_OP1:
case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
@@ -6214,6 +6613,14 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
if (MUL)
DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
+
+ // Set the flags on the inserted instructions to be the merged flags of the
+ // instructions that we have combined.
+ uint16_t Flags = Root.getFlags();
+ if (MUL)
+ Flags = Root.mergeFlagsWith(*MUL);
+ for (auto *MI : InsInstrs)
+ MI->setFlags(Flags);
}
/// Replace csincr-branch sequence by simple conditional branch
@@ -6526,13 +6933,12 @@ enum MachineOutlinerMBBFlags {
UnsafeRegsDead = 0x8
};
-unsigned
-AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
- assert(C.LRUWasSet && "LRU wasn't set?");
+Register
+AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
MachineFunction *MF = C.getMF();
- const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
-
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ const AArch64RegisterInfo *ARI =
+ static_cast<const AArch64RegisterInfo *>(&TRI);
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
@@ -6540,12 +6946,11 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
Reg != AArch64::LR && // LR is not reserved, but don't use it.
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
Reg != AArch64::X17 && // Ditto for X17.
- C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
+ C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
+ C.isAvailableInsideSeq(Reg, TRI))
return Reg;
}
-
- // No suitable register. Return 0.
- return 0u;
+ return Register();
}
static bool
@@ -6691,10 +7096,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
unsigned FlagsSetInAll = 0xF;
// Compute liveness information for each candidate, and set FlagsSetInAll.
- std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
- [&FlagsSetInAll](outliner::Candidate &C) {
- FlagsSetInAll &= C.Flags;
- });
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ FlagsSetInAll &= C.Flags;
// According to the AArch64 Procedure Call Standard, the following are
// undefined on entry/exit from a function call:
@@ -6712,10 +7115,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// to compute liveness here.
if (C.Flags & UnsafeRegsDead)
return false;
- C.initLRU(TRI);
- LiveRegUnits LRU = C.LRU;
- return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
- !LRU.available(AArch64::NZCV));
+ return C.isAnyUnavailableAcrossOrOutOfSeq(
+ {AArch64::W16, AArch64::W17, AArch64::NZCV}, TRI);
};
// Are there any candidates where those registers are live?
@@ -6752,12 +7153,10 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// We check to see if CFI Instructions are present, and if they are
// we find the number of CFI Instructions in the candidates.
unsigned CFICount = 0;
- MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
- for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
- Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
- if (MBBI->isCFIInstruction())
+ for (auto &I : make_range(RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()))) {
+ if (I.isCFIInstruction())
CFICount++;
- MBBI++;
}
// We compare the number of found CFI Instructions to the number of CFI
@@ -6860,8 +7259,6 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// Check if we have to save LR.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
- C.initLRU(TRI);
-
// If we have a noreturn caller, then we're going to be conservative and
// say that we have to save LR. If we don't have a ret at the end of the
// block, then we can't reason about liveness accurately.
@@ -6872,7 +7269,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
// Is LR available? If so, we don't need a save.
- if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
+ if (C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) && !IsNoReturn) {
NumBytesNoStackCalls += 4;
C.setCallInfo(MachineOutlinerNoLRSave, 4);
CandidatesWithoutStackFixups.push_back(C);
@@ -6888,7 +7285,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// Is SP used in the sequence at all? If not, we don't have to modify
// the stack, so we are guaranteed to get the same frame.
- else if (C.UsedInSequence.available(AArch64::SP)) {
+ else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerDefault, 12);
CandidatesWithoutStackFixups.push_back(C);
@@ -6957,11 +7354,12 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// LR to (ie one extra stack save/restore).
//
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
- erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
+ erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
return (std::any_of(
C.front(), std::next(C.back()),
[](const MachineInstr &MI) { return MI.isCall(); })) &&
- (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C));
+ (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
+ !findRegisterToSaveLRTo(C));
});
}
}
@@ -7032,7 +7430,7 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
// modify the stack. Check if hasRedZone is true or unknown; if yes, don't
// outline from it.
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (!AFI || AFI->hasRedZone().getValueOr(true))
+ if (!AFI || AFI->hasRedZone().value_or(true))
return false;
// FIXME: Teach the outliner to generate/handle Windows unwind info.
@@ -7053,8 +7451,8 @@ bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
"Suitable Machine Function for outlining must track liveness");
LiveRegUnits LRU(getRegisterInfo());
- std::for_each(MBB.rbegin(), MBB.rend(),
- [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
+ for (MachineInstr &MI : llvm::reverse(MBB))
+ LRU.accumulate(MI);
// Check if each of the unsafe registers are available...
bool W16AvailableInBlock = LRU.available(AArch64::W16);
@@ -7333,14 +7731,17 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(AArch64::SP, RegState::InternalRead);
MI.setMIFlag(MachineInstr::FrameSetup);
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
- BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
// If v8.3a features are available we can replace a RET instruction by
- // RETAA or RETAB and omit the AUT instructions
+ // RETAA or RETAB and omit the AUT instructions. In this case the
+ // DW_CFA_AARCH64_negate_ra_state can't be emitted.
if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
BuildMI(MBB, MBBAUT, DL,
@@ -7353,6 +7754,11 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
: AArch64::AUTIBSP))
.setMIFlag(MachineInstr::FrameDestroy);
+ unsigned CFIIndexAuth =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndexAuth)
+ .setMIFlags(MachineInstr::FrameDestroy);
}
}
}
@@ -7424,24 +7830,26 @@ void AArch64InstrInfo::buildOutlinedFrame(
.addImm(-16);
It = MBB.insert(It, STRXpre);
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- const MCRegisterInfo *MRI = STI.getRegisterInfo();
- unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
-
- // Add a CFI saying the stack was moved 16 B down.
- int64_t StackPosEntry =
- MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
- BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
- .addCFIIndex(StackPosEntry)
- .setMIFlags(MachineInstr::FrameSetup);
-
- // Add a CFI saying that the LR that we want to find is now 16 B higher than
- // before.
- int64_t LRPosEntry =
- MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
- BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
- .addCFIIndex(LRPosEntry)
- .setMIFlags(MachineInstr::FrameSetup);
+ if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) {
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const MCRegisterInfo *MRI = STI.getRegisterInfo();
+ unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
+
+ // Add a CFI saying the stack was moved 16 B down.
+ int64_t StackPosEntry =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(StackPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ // Add a CFI saying that the LR that we want to find is now 16 B higher
+ // than before.
+ int64_t LRPosEntry = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
+ BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
@@ -7495,7 +7903,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
- MachineFunction &MF, const outliner::Candidate &C) const {
+ MachineFunction &MF, outliner::Candidate &C) const {
// Are we tail calling?
if (C.CallConstructionID == MachineOutlinerTailCall) {
@@ -7526,8 +7934,8 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
if (C.CallConstructionID == MachineOutlinerRegSave) {
// FIXME: This logic should be sunk into a target-specific interface so that
// we don't have to recompute the register.
- unsigned Reg = findRegisterToSaveLRTo(C);
- assert(Reg != 0 && "No callee-saved register available?");
+ Register Reg = findRegisterToSaveLRTo(C);
+ assert(Reg && "No callee-saved register available?");
// LR has to be a live in so that we can save it.
if (!MBB.isLiveIn(AArch64::LR))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 1054bea40e68..b7a6ac301cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -103,6 +103,21 @@ public:
/// Returns whether the instruction is a pre-indexed load/store.
static bool isPreLdSt(const MachineInstr &MI);
+ /// Returns whether the instruction is a paired load/store.
+ static bool isPairedLdSt(const MachineInstr &MI);
+
+ /// Returns the base register operator of a load/store.
+ static const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
+
+ /// Returns the the immediate offset operator of a load/store.
+ static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+
+ /// Returns whether the instruction is FP or NEON.
+ static bool isFpOrNEON(const MachineInstr &MI);
+
+ /// Returns whether the instruction is in Q form (128 bit operands)
+ static bool isQForm(const MachineInstr &MI);
+
/// Returns the index for the immediate for a given instruction.
static unsigned getLoadStoreImmIdx(unsigned Opc);
@@ -283,7 +298,7 @@ public:
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const outliner::Candidate &C) const override;
+ outliner::Candidate &C) const override;
bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
/// Returns the vector element size (B, H, S or D) of an SVE opcode.
uint64_t getElementSizeForOpcode(unsigned Opc) const;
@@ -347,7 +362,7 @@ private:
/// Returns an unused general-purpose register which can be used for
/// constructing an outlined call if one exists. Returns 0 otherwise.
- unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+ Register findRegisterToSaveLRTo(outliner::Candidate &C) const;
/// Remove a ptest of a predicate-generating operation that already sets, or
/// can be made to set, the condition codes in an identical manner
@@ -356,12 +371,45 @@ private:
const MachineRegisterInfo *MRI) const;
};
+struct UsedNZCV {
+ bool N = false;
+ bool Z = false;
+ bool C = false;
+ bool V = false;
+
+ UsedNZCV() = default;
+
+ UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
+ this->N |= UsedFlags.N;
+ this->Z |= UsedFlags.Z;
+ this->C |= UsedFlags.C;
+ this->V |= UsedFlags.V;
+ return *this;
+ }
+};
+
+/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
+/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
+/// \returns None otherwise.
+///
+/// Collect instructions using that flags in \p CCUseInstrs if provided.
+Optional<UsedNZCV>
+examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
+ const TargetRegisterInfo &TRI,
+ SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr);
+
/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI
/// which either reads or clobbers NZCV.
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
const MachineInstr &UseMI,
const TargetRegisterInfo *TRI);
+MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg,
+ unsigned Reg, const StackOffset &Offset,
+ bool LastAdjustmentWasScalable = true);
+MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg,
+ const StackOffset &OffsetFromDefCFA);
+
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
/// plus Offset. This is intended to be used from within the prolog/epilog
/// insertion (PEI) pass, where a virtual scratch register may be allocated
@@ -371,7 +419,9 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
StackOffset Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false, bool NeedsWinCFI = false,
- bool *HasWinCFI = nullptr);
+ bool *HasWinCFI = nullptr, bool EmitCFAOffset = false,
+ StackOffset InitialOffset = {},
+ unsigned FrameReg = AArch64::SP);
/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
/// FP. Return false if the offset could not be handled directly in MI, and
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 83bf89ff97c5..3802a45ad6c1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -14,196 +14,196 @@
// ARM Instruction Predicate Definitions.
//
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">;
+ AssemblerPredicateWithAll<(all_of HasV8_1aOps), "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">;
+ AssemblerPredicateWithAll<(all_of HasV8_2aOps), "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">;
+ AssemblerPredicateWithAll<(all_of HasV8_3aOps), "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
+ AssemblerPredicateWithAll<(all_of HasV8_4aOps), "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+ AssemblerPredicateWithAll<(all_of HasV8_5aOps), "armv8.5a">;
def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
- AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
+ AssemblerPredicateWithAll<(all_of HasV8_6aOps), "armv8.6a">;
def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">,
- AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">;
+ AssemblerPredicateWithAll<(all_of HasV8_7aOps), "armv8.7a">;
def HasV9_0a : Predicate<"Subtarget->hasV9_0aOps()">,
- AssemblerPredicate<(all_of HasV9_0aOps), "armv9-a">;
+ AssemblerPredicateWithAll<(all_of HasV9_0aOps), "armv9-a">;
def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">,
- AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">;
+ AssemblerPredicateWithAll<(all_of HasV9_1aOps), "armv9.1a">;
def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">,
- AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">;
+ AssemblerPredicateWithAll<(all_of HasV9_2aOps), "armv9.2a">;
def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">,
- AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">;
+ AssemblerPredicateWithAll<(all_of HasV9_3aOps), "armv9.3a">;
def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">,
- AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">;
+ AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">;
def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">,
- AssemblerPredicate<(all_of FeatureEL2VMSA), "el2vmsa">;
+ AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">;
def HasEL3 : Predicate<"Subtarget->hasEL3()">,
- AssemblerPredicate<(all_of FeatureEL3), "el3">;
+ AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">;
def HasVH : Predicate<"Subtarget->hasVH()">,
- AssemblerPredicate<(all_of FeatureVH), "vh">;
+ AssemblerPredicateWithAll<(all_of FeatureVH), "vh">;
def HasLOR : Predicate<"Subtarget->hasLOR()">,
- AssemblerPredicate<(all_of FeatureLOR), "lor">;
+ AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">;
def HasPAuth : Predicate<"Subtarget->hasPAuth()">,
- AssemblerPredicate<(all_of FeaturePAuth), "pauth">;
+ AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">;
def HasJS : Predicate<"Subtarget->hasJS()">,
- AssemblerPredicate<(all_of FeatureJS), "jsconv">;
+ AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">;
def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">,
- AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">;
+ AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">;
def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
- AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">;
+ AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">;
def HasNV : Predicate<"Subtarget->hasNV()">,
- AssemblerPredicate<(all_of FeatureNV), "nv">;
+ AssemblerPredicateWithAll<(all_of FeatureNV), "nv">;
def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
- AssemblerPredicate<(all_of FeatureMPAM), "mpam">;
+ AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">;
def HasDIT : Predicate<"Subtarget->hasDIT()">,
- AssemblerPredicate<(all_of FeatureDIT), "dit">;
+ AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">;
def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">,
- AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">;
+ AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">;
def HasAM : Predicate<"Subtarget->hasAM()">,
- AssemblerPredicate<(all_of FeatureAM), "am">;
+ AssemblerPredicateWithAll<(all_of FeatureAM), "am">;
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
- AssemblerPredicate<(all_of FeatureSEL2), "sel2">;
+ AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">;
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
- AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">;
+ AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">;
def HasFlagM : Predicate<"Subtarget->hasFlagM()">,
- AssemblerPredicate<(all_of FeatureFlagM), "flagm">;
+ AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">;
def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
- AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
+ AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
- AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">;
+ AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<(all_of FeatureNEON), "neon">;
+ AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<(all_of FeatureCrypto), "crypto">;
+ AssemblerPredicateWithAll<(all_of FeatureCrypto), "crypto">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
- AssemblerPredicate<(all_of FeatureSM4), "sm4">;
+ AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
- AssemblerPredicate<(all_of FeatureSHA3), "sha3">;
+ AssemblerPredicateWithAll<(all_of FeatureSHA3), "sha3">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<(all_of FeatureSHA2), "sha2">;
+ AssemblerPredicateWithAll<(all_of FeatureSHA2), "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<(all_of FeatureAES), "aes">;
+ AssemblerPredicateWithAll<(all_of FeatureAES), "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<(all_of FeatureDotProd), "dotprod">;
+ AssemblerPredicateWithAll<(all_of FeatureDotProd), "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<(all_of FeatureCRC), "crc">;
+ AssemblerPredicateWithAll<(all_of FeatureCRC), "crc">;
def HasLSE : Predicate<"Subtarget->hasLSE()">,
- AssemblerPredicate<(all_of FeatureLSE), "lse">;
+ AssemblerPredicateWithAll<(all_of FeatureLSE), "lse">;
def HasNoLSE : Predicate<"!Subtarget->hasLSE()">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<(all_of FeatureRAS), "ras">;
+ AssemblerPredicateWithAll<(all_of FeatureRAS), "ras">;
def HasRDM : Predicate<"Subtarget->hasRDM()">,
- AssemblerPredicate<(all_of FeatureRDM), "rdm">;
+ AssemblerPredicateWithAll<(all_of FeatureRDM), "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">;
+ AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">;
+ AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
- AssemblerPredicate<(all_of FeatureSPE), "spe">;
+ AssemblerPredicateWithAll<(all_of FeatureSPE), "spe">;
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
- AssemblerPredicate<(all_of FeatureFuseAES),
+ AssemblerPredicateWithAll<(all_of FeatureFuseAES),
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->hasSVE()">,
- AssemblerPredicate<(all_of FeatureSVE), "sve">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
- AssemblerPredicate<(all_of FeatureSVE2), "sve2">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
- AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">;
def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
- AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">;
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
- AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
+ AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
def HasSME : Predicate<"Subtarget->hasSME()">,
- AssemblerPredicate<(all_of FeatureSME), "sme">;
+ AssemblerPredicateWithAll<(all_of FeatureSME), "sme">;
def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">,
- AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">;
+ AssemblerPredicateWithAll<(all_of FeatureSMEF64), "sme-f64">;
def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">,
- AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">;
-def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">,
- AssemblerPredicate<(all_of FeatureStreamingSVE), "streaming-sve">;
+ AssemblerPredicateWithAll<(all_of FeatureSMEI64), "sme-i64">;
// A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
// they should be enabled if either has been specified.
-def HasSVEorStreamingSVE
- : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">,
- AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE),
- "streaming-sve or sve">;
-def HasSVE2orStreamingSVE
- : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">,
- AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE),
- "streaming-sve or sve2">;
+def HasSVEorSME
+ : Predicate<"Subtarget->hasSVE() || Subtarget->hasSME()">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME),
+ "sve or sme">;
+def HasSVE2orSME
+ : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME),
+ "sve2 or sme">;
// A subset of NEON instructions are legal in Streaming SVE execution mode,
// they should be enabled if either has been specified.
-def HasNEONorStreamingSVE
- : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">,
- AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE),
- "streaming-sve or neon">;
+def HasNEONorSME
+ : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
+ AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
+ "neon or sme">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
- AssemblerPredicate<(all_of FeatureRCPC), "rcpc">;
+ AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
+def HasLDAPR : Predicate<"Subtarget->hasLDAPR()">,
+ AssemblerPredicateWithAll<(all_of FeatureLDAPR), "ldapr">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
- AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">;
+ AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">;
def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">,
- AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">;
+ AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">;
def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<(all_of FeatureSB), "sb">;
+ AssemblerPredicateWithAll<(all_of FeatureSB), "sb">;
def HasPredRes : Predicate<"Subtarget->hasPredRes()">,
- AssemblerPredicate<(all_of FeaturePredRes), "predres">;
+ AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">;
def HasCCDP : Predicate<"Subtarget->hasCCDP()">,
- AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">;
+ AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">;
def HasBTI : Predicate<"Subtarget->hasBTI()">,
- AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">;
+ AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
- AssemblerPredicate<(all_of FeatureMTE), "mte">;
+ AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">;
def HasTME : Predicate<"Subtarget->hasTME()">,
- AssemblerPredicate<(all_of FeatureTME), "tme">;
+ AssemblerPredicateWithAll<(all_of FeatureTME), "tme">;
def HasETE : Predicate<"Subtarget->hasETE()">,
- AssemblerPredicate<(all_of FeatureETE), "ete">;
+ AssemblerPredicateWithAll<(all_of FeatureETE), "ete">;
def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
- AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
+ AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">,
- AssemblerPredicate<(all_of FeatureBF16), "bf16">;
+ AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;
def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
- AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
+ AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;
def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
- AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
+ AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
- AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
+ AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
def HasXS : Predicate<"Subtarget->hasXS()">,
- AssemblerPredicate<(all_of FeatureXS), "xs">;
+ AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
def HasWFxT : Predicate<"Subtarget->hasWFxT()">,
- AssemblerPredicate<(all_of FeatureWFxT), "wfxt">;
+ AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">;
def HasLS64 : Predicate<"Subtarget->hasLS64()">,
- AssemblerPredicate<(all_of FeatureLS64), "ls64">;
+ AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">;
def HasBRBE : Predicate<"Subtarget->hasBRBE()">,
- AssemblerPredicate<(all_of FeatureBRBE), "brbe">;
+ AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">;
def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">,
- AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">;
+ AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">;
def HasHBC : Predicate<"Subtarget->hasHBC()">,
- AssemblerPredicate<(all_of FeatureHBC), "hbc">;
+ AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">;
def HasMOPS : Predicate<"Subtarget->hasMOPS()">,
- AssemblerPredicate<(all_of FeatureMOPS), "mops">;
+ AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -350,49 +350,49 @@ def nonext_masked_load :
cast<MaskedLoadSDNode>(N)->isUnindexed() &&
!cast<MaskedLoadSDNode>(N)->isNonTemporal();
}]>;
-// sign extending masked load fragments.
-def asext_masked_load :
+// Any/Zero extending masked load fragments.
+def azext_masked_load :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
(masked_ld node:$ptr, undef, node:$pred, node:$def),[{
return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD) &&
cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
-def asext_masked_load_i8 :
+def azext_masked_load_i8 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (azext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
-def asext_masked_load_i16 :
+def azext_masked_load_i16 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (azext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
-def asext_masked_load_i32 :
+def azext_masked_load_i32 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (azext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
-// zero extending masked load fragments.
-def zext_masked_load :
+// Sign extending masked load fragments.
+def sext_masked_load :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
(masked_ld node:$ptr, undef, node:$pred, node:$def), [{
- return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD &&
cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
-def zext_masked_load_i8 :
+def sext_masked_load_i8 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (sext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
-def zext_masked_load_i16 :
+def sext_masked_load_i16 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (sext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
-def zext_masked_load_i32 :
+def sext_masked_load_i32 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ (sext_masked_load node:$ptr, node:$pred, node:$def), [{
return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
@@ -443,6 +443,58 @@ def non_temporal_store :
cast<MaskedStoreSDNode>(N)->isNonTemporal();
}]>;
+multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
+ // offsets = (signed)Index << sizeof(elt)
+ def NAME#_signed_scaled :
+ PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx),
+ (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{
+ auto MGS = cast<MaskedGatherScatterSDNode>(N);
+ bool Signed = MGS->isIndexSigned() ||
+ MGS->getIndex().getValueType().getVectorElementType() == MVT::i64;
+ return Signed && MGS->isIndexScaled();
+ }]>;
+ // offsets = (signed)Index
+ def NAME#_signed_unscaled :
+ PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx),
+ (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{
+ auto MGS = cast<MaskedGatherScatterSDNode>(N);
+ bool Signed = MGS->isIndexSigned() ||
+ MGS->getIndex().getValueType().getVectorElementType() == MVT::i64;
+ return Signed && !MGS->isIndexScaled();
+ }]>;
+ // offsets = (unsigned)Index << sizeof(elt)
+ def NAME#_unsigned_scaled :
+ PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx),
+ (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{
+ auto MGS = cast<MaskedGatherScatterSDNode>(N);
+ bool Signed = MGS->isIndexSigned() ||
+ MGS->getIndex().getValueType().getVectorElementType() == MVT::i64;
+ return !Signed && MGS->isIndexScaled();
+ }]>;
+ // offsets = (unsigned)Index
+ def NAME#_unsigned_unscaled :
+ PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx),
+ (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{
+ auto MGS = cast<MaskedGatherScatterSDNode>(N);
+ bool Signed = MGS->isIndexSigned() ||
+ MGS->getIndex().getValueType().getVectorElementType() == MVT::i64;
+ return !Signed && !MGS->isIndexScaled();
+ }]>;
+}
+
+defm nonext_masked_gather : masked_gather_scatter<nonext_masked_gather>;
+defm azext_masked_gather_i8 : masked_gather_scatter<azext_masked_gather_i8>;
+defm azext_masked_gather_i16 : masked_gather_scatter<azext_masked_gather_i16>;
+defm azext_masked_gather_i32 : masked_gather_scatter<azext_masked_gather_i32>;
+defm sext_masked_gather_i8 : masked_gather_scatter<sext_masked_gather_i8>;
+defm sext_masked_gather_i16 : masked_gather_scatter<sext_masked_gather_i16>;
+defm sext_masked_gather_i32 : masked_gather_scatter<sext_masked_gather_i32>;
+
+defm nontrunc_masked_scatter : masked_gather_scatter<nontrunc_masked_scatter>;
+defm trunc_masked_scatter_i8 : masked_gather_scatter<trunc_masked_scatter_i8>;
+defm trunc_masked_scatter_i16 : masked_gather_scatter<trunc_masked_scatter_i16>;
+defm trunc_masked_scatter_i32 : masked_gather_scatter<trunc_masked_scatter_i32>;
+
// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
def top16Zero: PatLeaf<(i32 GPR32:$src), [{
return SDValue(N,0)->getValueType(0) == MVT::i32 &&
@@ -473,6 +525,11 @@ def AArch64call : SDNode<"AArch64ISD::CALL",
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+def AArch64call_bti : SDNode<"AArch64ISD::CALL_BTI",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -526,6 +583,7 @@ def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>;
def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>;
@@ -612,8 +670,10 @@ def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisSameAs<1, 2>]>;
-def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
-def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
+ [SDNPCommutative]>;
+def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
+ [SDNPCommutative]>;
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
@@ -630,11 +690,6 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
-def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
-def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
-def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>;
-def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>;
-
def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
[(abdu node:$lhs, node:$rhs),
(int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
@@ -642,10 +697,21 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
[(abds node:$lhs, node:$rhs),
(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
+def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>;
def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
+def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>;
+def AArch64addp : PatFrags<(ops node:$Rn, node:$Rm),
+ [(AArch64addp_n node:$Rn, node:$Rm),
+ (int_aarch64_neon_addp node:$Rn, node:$Rm)]>;
def AArch64uaddlp : PatFrags<(ops node:$src),
[(AArch64uaddlp_n node:$src),
(int_aarch64_neon_uaddlp node:$src)]>;
+def AArch64saddlp : PatFrags<(ops node:$src),
+ [(AArch64saddlp_n node:$src),
+ (int_aarch64_neon_saddlp node:$src)]>;
+def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm),
+ [(AArch64addp_n node:$Rn, node:$Rm),
+ (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>;
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -669,6 +735,22 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
[SDNPHasChain, SDNPOutGlue]>;
+
+// Match an 'add' node, and also treat an 'or' node as an 'add' if the or'ed
+// operands have no common bits.
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
+ [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{
+ if (N->getOpcode() == ISD::ADD)
+ return true;
+ return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1));
+}]> {
+ let GISelPredicateCode = [{
+    // Only handle G_ADD for now. FIXME: build capability to compute whether
+    // operands of G_OR have common bits set or not.
+ return MI.getOpcode() == TargetOpcode::G_ADD;
+ }];
+}
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -939,7 +1021,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
VectorIndexS:$idx)>;
}
-let Predicates = [HasNEONorStreamingSVE, HasBF16] in {
+let Predicates = [HasNEONorSME, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
}
@@ -1025,6 +1107,15 @@ def : EOR3_pattern<v8i16>;
def : EOR3_pattern<v4i32>;
def : EOR3_pattern<v2i64>;
+class BCAX_pattern<ValueType VecTy>
+ : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))),
+ (BCAX (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>;
+
+def : BCAX_pattern<v16i8>;
+def : BCAX_pattern<v8i16>;
+def : BCAX_pattern<v4i32>;
+def : BCAX_pattern<v2i64>;
+
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v16i8>;
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v8i16>;
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v4i32>;
@@ -2073,6 +2164,10 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
+def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
+ (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
+ (REV16Xr GPR64:$Rn)>;
+
//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
@@ -2320,6 +2415,8 @@ let isCall = 1, Defs = [LR], Uses = [SP] in {
PseudoInstExpansion<(BLR GPR64:$Rn)>;
def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>,
Sched<[WriteBrReg]>;
+ def BLR_BTI : Pseudo<(outs), (ins variable_ops), []>,
+ Sched<[WriteBrReg]>;
} // isCall
def : Pat<(AArch64call GPR64:$Rn),
@@ -2333,6 +2430,10 @@ def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn),
(BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>,
Requires<[NoSLSBLRMitigation]>;
+def : Pat<(AArch64call_bti GPR64:$Rn),
+ (BLR_BTI GPR64:$Rn)>,
+ Requires<[NoSLSBLRMitigation]>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch
@@ -2359,6 +2460,10 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
// augmentation string.
def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {}
+// Pseudo instruction to tell the streamer to emit a 'G' character into the
+// augmentation string.
+def EMITMTETAGGED : Pseudo<(outs), (ins), []>, Sched<[]> {}
+
// FIXME: maybe the scratch register used shouldn't be fixed to X1?
// FIXME: can "hasSideEffects" be dropped?
// This gets lowered to an instruction sequence which takes 16 bytes
@@ -2409,7 +2514,8 @@ def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
// Exception generation instructions.
//===----------------------------------------------------------------------===//
let isTrap = 1 in {
-def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+def BRK : ExceptionGeneration<0b001, 0b00, "brk",
+ [(int_aarch64_break timm32_0_65535:$imm)]>;
}
def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
@@ -3891,24 +3997,24 @@ defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">;
let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (lround f16:$Rn)),
+ def : Pat<(i32 (any_lround f16:$Rn)),
(!cast<Instruction>(FCVTASUWHr) f16:$Rn)>;
- def : Pat<(i64 (lround f16:$Rn)),
+ def : Pat<(i64 (any_lround f16:$Rn)),
(!cast<Instruction>(FCVTASUXHr) f16:$Rn)>;
- def : Pat<(i64 (llround f16:$Rn)),
+ def : Pat<(i64 (any_llround f16:$Rn)),
(!cast<Instruction>(FCVTASUXHr) f16:$Rn)>;
}
-def : Pat<(i32 (lround f32:$Rn)),
+def : Pat<(i32 (any_lround f32:$Rn)),
(!cast<Instruction>(FCVTASUWSr) f32:$Rn)>;
-def : Pat<(i32 (lround f64:$Rn)),
+def : Pat<(i32 (any_lround f64:$Rn)),
(!cast<Instruction>(FCVTASUWDr) f64:$Rn)>;
-def : Pat<(i64 (lround f32:$Rn)),
+def : Pat<(i64 (any_lround f32:$Rn)),
(!cast<Instruction>(FCVTASUXSr) f32:$Rn)>;
-def : Pat<(i64 (lround f64:$Rn)),
+def : Pat<(i64 (any_lround f64:$Rn)),
(!cast<Instruction>(FCVTASUXDr) f64:$Rn)>;
-def : Pat<(i64 (llround f32:$Rn)),
+def : Pat<(i64 (any_llround f32:$Rn)),
(!cast<Instruction>(FCVTASUXSr) f32:$Rn)>;
-def : Pat<(i64 (llround f64:$Rn)),
+def : Pat<(i64 (any_llround f64:$Rn)),
(!cast<Instruction>(FCVTASUXDr) f64:$Rn)>;
//===----------------------------------------------------------------------===//
@@ -3949,20 +4055,20 @@ defm FCVT : FPConversion<"fcvt">;
// Floating point single operand instructions.
//===----------------------------------------------------------------------===//
-defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
-defm FMOV : SingleOperandFPData<0b0000, "fmov">;
-defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
-defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
-defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
-defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
-defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>;
-defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+defm FABS : SingleOperandFPDataNoException<0b0001, "fabs", fabs>;
+defm FMOV : SingleOperandFPDataNoException<0b0000, "fmov">;
+defm FNEG : SingleOperandFPDataNoException<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", any_fround>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", any_fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", any_ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", any_froundeven>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", any_fceil>;
-defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", any_frint>;
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", any_ftrunc>;
let SchedRW = [WriteFDiv] in {
-defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", any_fsqrt>;
}
let Predicates = [HasFRInt3264] in {
@@ -3972,44 +4078,48 @@ let Predicates = [HasFRInt3264] in {
defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>;
} // HasFRInt3264
+// Emitting strict_lrint as two instructions is valid as any exceptions that
+// occur will happen in exactly one of the instructions (e.g. if the input is
+// not an integer the inexact exception will happen in the FRINTX but not then
+// in the FCVTZS as the output of FRINTX is an integer).
let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (lrint f16:$Rn)),
+ def : Pat<(i32 (any_lrint f16:$Rn)),
(FCVTZSUWHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
- def : Pat<(i64 (lrint f16:$Rn)),
+ def : Pat<(i64 (any_lrint f16:$Rn)),
(FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
- def : Pat<(i64 (llrint f16:$Rn)),
+ def : Pat<(i64 (any_llrint f16:$Rn)),
(FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
}
-def : Pat<(i32 (lrint f32:$Rn)),
+def : Pat<(i32 (any_lrint f32:$Rn)),
(FCVTZSUWSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
-def : Pat<(i32 (lrint f64:$Rn)),
+def : Pat<(i32 (any_lrint f64:$Rn)),
(FCVTZSUWDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
-def : Pat<(i64 (lrint f32:$Rn)),
+def : Pat<(i64 (any_lrint f32:$Rn)),
(FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
-def : Pat<(i64 (lrint f64:$Rn)),
+def : Pat<(i64 (any_lrint f64:$Rn)),
(FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
-def : Pat<(i64 (llrint f32:$Rn)),
+def : Pat<(i64 (any_llrint f32:$Rn)),
(FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
-def : Pat<(i64 (llrint f64:$Rn)),
+def : Pat<(i64 (any_llrint f64:$Rn)),
(FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
//===----------------------------------------------------------------------===//
// Floating point two operand instructions.
//===----------------------------------------------------------------------===//
-defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
+defm FADD : TwoOperandFPData<0b0010, "fadd", any_fadd>;
let SchedRW = [WriteFDiv] in {
-defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+defm FDIV : TwoOperandFPData<0b0001, "fdiv", any_fdiv>;
}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
-defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
-defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>;
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", any_fmaxnum>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", any_fmaximum>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", any_fminnum>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", any_fminimum>;
let SchedRW = [WriteFMul] in {
-defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
-defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+defm FMUL : TwoOperandFPData<0b0000, "fmul", any_fmul>;
+defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>;
}
-defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
+defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>;
def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
@@ -4024,13 +4134,13 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
// Floating point three operand instructions.
//===----------------------------------------------------------------------===//
-defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMADD : ThreeOperandFPData<0, 0, "fmadd", any_fma>;
defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
- TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+ TriOpFrag<(any_fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
- TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+ TriOpFrag<(fneg (any_fma node:$LHS, node:$MHS, node:$RHS))> >;
defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
- TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+ TriOpFrag<(any_fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
// The following def pats catch the case where the LHS of an FMA is negated.
// The TriOpFrag above catches the case where the middle operand is negated.
@@ -4159,25 +4269,25 @@ def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(zext (v8i8 V64:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
- (zext (extract_high_v16i8 V128:$opB))))),
+def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))),
+ (zext (extract_high_v16i8 (v16i8 V128:$opB)))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
- (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
- (zext (extract_high_v16i8 V128:$opB))),
+ (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))),
+ (zext (extract_high_v16i8 (v16i8 V128:$opB)))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
(zext (v4i16 V64:$opB))))),
(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
- (zext (extract_high_v8i16 V128:$opB))))),
+def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))),
+ (zext (extract_high_v8i16 (v8i16 V128:$opB)))))),
(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
(zext (v2i32 V64:$opB))))),
(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
-def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
- (zext (extract_high_v4i32 V128:$opB))))),
+def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))),
+ (zext (extract_high_v4i32 (v4i32 V128:$opB)))))),
(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
@@ -4189,7 +4299,7 @@ defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
-defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+defm FABS : SIMDTwoVectorFPNoException<0, 1, 0b01111, "fabs", fabs>;
def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))),
(CMLTv8i8rz V64:$Rn)>;
@@ -4219,9 +4329,9 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
(i64 4)))),
(FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
@@ -4233,16 +4343,16 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
-def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
+def : Pat<(v2f32 (any_fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (any_fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (any_fpround (v2f64 V128:$Rn)))),
(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
-defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
-defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
// AArch64's FCVT instructions saturate when out of range.
multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> {
@@ -4272,15 +4382,15 @@ def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
-defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FNEG : SIMDTwoVectorFPNoException<1, 1, 0b01111, "fneg", fneg>;
defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
-defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
-defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
-defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>;
-defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
-defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
-defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", any_fround>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", any_fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", any_ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", any_froundeven>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", any_fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", any_frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", any_ftrunc>;
let Predicates = [HasFRInt3264] in {
defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>;
@@ -4290,7 +4400,7 @@ let Predicates = [HasFRInt3264] in {
} // HasFRInt3264
defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
-defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", any_fsqrt>;
defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
@@ -4312,9 +4422,9 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
- BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
-defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
-defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+ BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>;
+defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>;
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
@@ -4324,7 +4434,7 @@ defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd
defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >;
defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>;
-defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>;
defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
@@ -4348,15 +4458,15 @@ def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
(SHLLv8i8 V64:$Rn)>;
- def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+ def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 (v16i8 V128:$Rn)))), (i32 8)),
(SHLLv16i8 V128:$Rn)>;
def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
(SHLLv4i16 V64:$Rn)>;
- def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+ def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 (v8i16 V128:$Rn)))), (i32 16)),
(SHLLv8i16 V128:$Rn)>;
def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
(SHLLv2i32 V64:$Rn)>;
- def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+ def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 (v4i32 V128:$Rn)))), (i32 32)),
(SHLLv4i32 V128:$Rn)>;
}
@@ -4426,7 +4536,7 @@ def : Pat<(v8i16 (concat_vectors
//===----------------------------------------------------------------------===//
defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", AArch64addp>;
defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
@@ -4447,33 +4557,33 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, V
}
defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp", AArch64faddp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>;
defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", any_fdiv>;
defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", any_fmaxnum>;
defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", any_fmaximum>;
defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>;
defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>;
// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
// instruction expects the addend first, while the fma intrinsic puts it last.
defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
- TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+ TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >;
defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
- TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+ TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", any_fmul>;
defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
-defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", any_fsub>;
// MLA and MLS are generated in MachineCombine
defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
@@ -4484,7 +4594,7 @@ defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
-defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>;
+defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
@@ -4496,14 +4606,14 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
-defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
-defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>;
+defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
@@ -4513,7 +4623,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
@@ -4753,11 +4863,13 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
-let Predicates = [HasFullFP16] in {
+let Predicates = [HasNEON, HasFullFP16] in {
def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
}
+let Predicates = [HasNEON] in {
def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
+}
defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
@@ -4765,9 +4877,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorStreamingSVE>;
-defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorStreamingSVE>;
-defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorStreamingSVE>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -4862,9 +4974,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorStreamingSVE>;
-defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorStreamingSVE>;
-defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorStreamingSVE>;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -4980,23 +5092,21 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
let Predicates = [HasNEON] in {
-def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
-def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
(SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>;
-def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
(UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>;
-def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
let Predicates = [HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
-def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
-}
-
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
@@ -5083,6 +5193,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
+} // let Predicates = [HasNEON]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
@@ -5102,10 +5213,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
@@ -5123,10 +5234,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
@@ -5161,74 +5272,15 @@ multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperat
V64:$Rn, V64:$Rm)), dsub)>;
}
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
+defm : Neon_mul_acc_widen_patterns<add, AArch64umull,
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
+defm : Neon_mul_acc_widen_patterns<add, AArch64smull,
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
+defm : Neon_mul_acc_widen_patterns<sub, AArch64umull,
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
+defm : Neon_mul_acc_widen_patterns<sub, AArch64smull,
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-// Additional patterns for SMULL and UMULL
-multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
- (INST8B V64:$Rn, V64:$Rm)>;
- def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
- (INST4H V64:$Rn, V64:$Rm)>;
- def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
- (INST2S V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
- SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
-defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
- UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
-
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm))),
- (INST8B V128:$Rn, V128:$Rm)>;
- def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm))),
- (INST4H V128:$Rn, V128:$Rm)>;
- def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm))),
- (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
- SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
- UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
-// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
-multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
- (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
- def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
- (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
- def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
- (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
- SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
- UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
- SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
- UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -5392,19 +5444,22 @@ defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+// Only the lower half of the result of the inner FADDP is used in the patterns
+// below, so the second operand does not matter. Re-use the first input
+// operand, so no additional dependencies need to be introduced.
let Predicates = [HasFullFP16] in {
def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
(FADDPv2i16p
(EXTRACT_SUBREG
- (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+ (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn),
dsub))>;
def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
- (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+ (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>;
}
def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
(FADDPv2i32p
(EXTRACT_SUBREG
- (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+ (FADDPv4f32 V128:$Rn, V128:$Rn),
dsub))>;
def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
(FADDPv2i32p V64:$Rn)>;
@@ -5856,24 +5911,28 @@ defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
-// Patterns for uaddv(uaddlp(x)) ==> uaddlv
-def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
- (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
- (i64 0))), (i64 0))),
- (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
- (UADDLVv8i8v V64:$op), hsub), ssub)>;
-def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp
- (v16i8 V128:$op))))), (i64 0))),
- (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
- (UADDLVv16i8v V128:$op), hsub), ssub)>;
-def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
- (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (UADDLVv8i16v V128:$op), ssub)>;
-
-// Patterns for addp(uaddlp(x))) ==> uaddlv
-def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))),
- (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>;
-def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))),
- (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>;
+multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp> {
+ // Patterns for addv(addlp(x)) ==> addlv
+ def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
+ (v4i16 (AArch64uaddv (v4i16 (addlp (v8i8 V64:$op))))),
+ (i64 0))), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(Opc#"v8i8v") V64:$op), hsub), ssub)>;
+ def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (addlp (v16i8 V128:$op))))), (i64 0))),
+ (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (!cast<Instruction>(Opc#"v16i8v") V128:$op), hsub), ssub)>;
+ def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v8i16v") V128:$op), ssub)>;
+
+ // Patterns for addp(addlp(x))) ==> addlv
+ def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i16v") V64:$op), ssub)>;
+ def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i32v") V128:$op), dsub)>;
+}
+
+defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>;
+defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>;
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
@@ -6185,6 +6244,14 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+let Predicates = [HasNEON] in {
+ // Using the MOVI to materialize fp constants.
+ def : Pat<(f32 fpimm32SIMDModImmType4:$in),
+ (EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in),
+ (i32 24)),
+ ssub)>;
+}
+
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -6273,18 +6340,18 @@ let hasSideEffects = 0 in {
// On the other hand, there are quite a few valid combinatorial options due to
// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
defm : SIMDFPIndexedTiedPatterns<"FMLA",
- TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+ TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>;
defm : SIMDFPIndexedTiedPatterns<"FMLA",
- TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+ TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
- TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+ TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
- TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+ TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
- TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+ TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
- TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+ TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
// 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
@@ -6363,22 +6430,22 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
}
defm : FMLSIndexedAfterNegPatterns<
- TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+ TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >;
defm : FMLSIndexedAfterNegPatterns<
- TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+ TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >;
defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
+defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>;
-def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv2i32_indexed V64:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
(i64 0))>;
-def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv4i32_indexed V128:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
(i64 0))>;
-def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
(FMULv2i64_indexed V128:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
(i64 0))>;
@@ -6397,11 +6464,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
- int_aarch64_neon_smull>;
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
@@ -6412,11 +6478,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
int_aarch64_neon_sqrdmlsh>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
- int_aarch64_neon_umull>;
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>;
// A scalar sqdmull with the second operand being a vector lane can be
// handled directly with the indexed instruction encoding.
@@ -6425,22 +6490,6 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
VectorIndexS:$idx)),
(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
-// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands
-// have no common bits.
-def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
- [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{
- if (N->getOpcode() == ISD::ADD)
- return true;
- return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1));
-}]> {
- let GISelPredicateCode = [{
- // Only handle G_ADD for now. FIXME. build capability to compute whether
- // operands of G_OR have common bits set or not.
- return MI.getOpcode() == TargetOpcode::G_ADD;
- }];
-}
-
-
//----------------------------------------------------------------------------
// AdvSIMD scalar shift instructions
//----------------------------------------------------------------------------
@@ -6480,7 +6529,7 @@ def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
(SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-// Patterns for FP16 Instrinsics - requires reg copy to/from as i16s not supported.
+// Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported.
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
@@ -6787,7 +6836,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
dsub)),
0),
ssub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -6807,7 +6856,8 @@ class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
INST,
hsub),
0),
- ssub)))>, Requires<[NotForCodeSize]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -6841,7 +6891,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
dsub)),
0),
dsub)))>,
- Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -6860,7 +6910,8 @@ class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
INST,
ssub),
0),
- dsub)))>, Requires<[NotForCodeSize]>;
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>;
def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
(LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
@@ -7216,14 +7267,6 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0
//----------------------------------------------------------------------------
// FIXME: Like for X86, these should go in their own separate .td file.
-def def32 : PatLeaf<(i32 GPR32:$src), [{
- return isDef32(*N);
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
// For an anyext, we don't care what the high bits are, so we can perform an
// INSERT_SUBREF into an IMPLICIT_DEF.
def : Pat<(i64 (anyext GPR32:$src)),
@@ -7387,99 +7430,16 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
//
// Natural vector casts (64 bit)
-def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
-def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
-def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
-def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
-def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in
+ foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in
+ def : Pat<(VT (AArch64NvCast (VT2 FPR64:$src))),
+ (VT FPR64:$src)>;
// Natural vector casts (128 bit)
-def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
-def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (AArch64NvCast (VT2 FPR128:$src))),
+ (VT FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
@@ -8093,17 +8053,17 @@ defm : InsertSubvectorUndef<i64>;
def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
(vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
(i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
-def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
- (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+def : Pat<(f64 (any_fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
(f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
// vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
// so we match on v4f32 here, not v2f32. This will also catch adding
// the low two lanes of a true v4f32 vector.
-def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
- (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+def : Pat<(any_fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+ (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
-def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
- (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
+def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
+ (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
(f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
// Scalar 64-bit shifts in FPR64 registers.
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 6aefc1fdb599..eaf39fc0dbb1 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -9,6 +9,12 @@
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
+// The pass runs after the PrologEpilogInserter where we emit the CFI
+// instructions. In order to preserve the correctness of the unwind information,
+// the pass should not change the order of any two instructions, one of which
+// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
+// to unwind information.
+//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
@@ -31,6 +37,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -549,26 +556,6 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
}
}
-static bool isPairedLdSt(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return false;
- case AArch64::LDPSi:
- case AArch64::LDPSWi:
- case AArch64::LDPDi:
- case AArch64::LDPQi:
- case AArch64::LDPWi:
- case AArch64::LDPXi:
- case AArch64::STPSi:
- case AArch64::STPDi:
- case AArch64::STPQi:
- case AArch64::STPWi:
- case AArch64::STPXi:
- case AArch64::STGPi:
- return true;
- }
-}
-
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
unsigned OpcA = FirstMI.getOpcode();
@@ -603,7 +590,7 @@ static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
int &MinOffset, int &MaxOffset) {
- bool IsPaired = isPairedLdSt(MI);
+ bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
bool IsTagStore = isTagStore(MI);
// ST*G and all paired ldst have the same scale in pre/post-indexed variants
// as in the "unsigned offset" variant.
@@ -625,17 +612,8 @@ static MachineOperand &getLdStRegOp(MachineInstr &MI,
bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
if (IsPreLdSt)
PairedRegOp += 1;
- unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
- return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
- unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
- return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
- unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
+ unsigned Idx =
+ AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
return MI.getOperand(Idx);
}
@@ -645,12 +623,14 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
int LoadSize = TII->getMemScale(LoadInst);
int StoreSize = TII->getMemScale(StoreInst);
- int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
- ? getLdStOffsetOp(StoreInst).getImm()
- : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
- int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
- ? getLdStOffsetOp(LoadInst).getImm()
- : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+ int UnscaledStOffset =
+ TII->hasUnscaledLdStOffset(StoreInst)
+ ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+ int UnscaledLdOffset =
+ TII->hasUnscaledLdStOffset(LoadInst)
+ ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
@@ -729,7 +709,7 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
case AArch64::STPWi:
case AArch64::STPXi:
// Make sure this is a reg+imm (as opposed to an address reloc).
- if (!getLdStOffsetOp(MI).isImm())
+ if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
return false;
return true;
@@ -763,17 +743,18 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
const MachineOperand &BaseRegOp =
- MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
+ MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
+ : AArch64InstrInfo::getLdStBaseOp(*I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI;
- if (getLdStOffsetOp(*I).getImm() ==
- getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
+ if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
+ AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
RtMI = &*MergeMI;
else
RtMI = &*I;
- int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
// Change the scaled offset from small to large type.
if (IsScaled) {
assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
@@ -923,6 +904,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
assert(all_of(MI.operands(),
[this, &RenameReg](const MachineOperand &MOP) {
return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
+ MOP.isUndef() ||
!TRI->regsOverlap(MOP.getReg(), *RenameReg);
}) &&
"Rename register used between paired instruction, trashing the "
@@ -936,10 +918,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
const MachineOperand &BaseRegOp =
- MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+ MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
+ : AArch64InstrInfo::getLdStBaseOp(*I);
- int Offset = getLdStOffsetOp(*I).getImm();
- int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+ int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
+ int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
if (IsUnscaled != PairedIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled. If
@@ -974,7 +957,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
RtMI = &*I;
Rt2MI = &*Paired;
}
- int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+ int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
// Scale the immediate offset, if necessary.
if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
@@ -1132,12 +1115,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
"Unsupported ld/st match");
assert(LoadSize <= StoreSize && "Invalid load size");
- int UnscaledLdOffset = IsUnscaled
- ? getLdStOffsetOp(*LoadI).getImm()
- : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
- int UnscaledStOffset = IsUnscaled
- ? getLdStOffsetOp(*StoreI).getImm()
- : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
+ int UnscaledLdOffset =
+ IsUnscaled
+ ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
+ int UnscaledStOffset =
+ IsUnscaled
+ ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
Register DestReg =
IsStoreXReg ? Register(TRI->getMatchingSuperReg(
@@ -1235,7 +1220,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
MachineInstr &LoadMI = *I;
- Register BaseReg = getLdStBaseOp(LoadMI).getReg();
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
// If the load is the first instruction in the block, there's obviously
// not any matching store.
@@ -1264,7 +1249,8 @@ bool AArch64LoadStoreOpt::findMatchingStore(
// Also we can't handle stores without an immediate offset operand,
// while the operand might be the address for a global variable.
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
- BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
+ BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
+ AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
StoreI = MBBI;
@@ -1467,18 +1453,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
return true;
}
-// Check if we can find a physical register for renaming. This register must:
-// * not be defined up to FirstMI (checking DefinedInBB)
-// * not used between the MI and the defining instruction of the register to
-// rename (checked using UsedInBetween).
+// Check if we can find a physical register for renaming \p Reg. This register
+// must:
+// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
+// defined registers up to the point where the renamed register will be used,
+// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
+// registers in the range where the rename register will be used,
// * is available in all used register classes (checked using RequiredClasses).
static Optional<MCPhysReg> tryToFindRegisterToRename(
- MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB,
+ const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
LiveRegUnits &UsedInBetween,
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
const TargetRegisterInfo *TRI) {
- auto &MF = *FirstMI.getParent()->getParent();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const MachineRegisterInfo &RegInfo = MF.getRegInfo();
// Checks if any sub- or super-register of PR is callee saved.
auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
@@ -1499,7 +1486,7 @@ static Optional<MCPhysReg> tryToFindRegisterToRename(
});
};
- auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
+ auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
for (const MCPhysReg &PR : *RegClass) {
if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
!RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
@@ -1530,8 +1517,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
Register Reg = getLdStRegOp(FirstMI).getReg();
- Register BaseReg = getLdStBaseOp(FirstMI).getReg();
- int Offset = getLdStOffsetOp(FirstMI).getImm();
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
+ int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
@@ -1566,7 +1553,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
Flags.setSExtIdx(-1);
if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
- getLdStOffsetOp(MI).isImm()) {
+ AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
assert(MI.mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
@@ -1574,8 +1561,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
- Register MIBaseReg = getLdStBaseOp(MI).getReg();
- int MIOffset = getLdStOffsetOp(MI).getImm();
+ Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
+ int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
if (IsUnscaled != MIIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled.
@@ -1606,15 +1593,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// can't be paired: bail and keep looking.
if (IsPreLdSt) {
bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
- bool IsBaseRegUsed =
- !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
- bool IsBaseRegModified =
- !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
+ bool IsBaseRegUsed = !UsedRegUnits.available(
+ AArch64InstrInfo::getLdStBaseOp(MI).getReg());
+ bool IsBaseRegModified = !ModifiedRegUnits.available(
+ AArch64InstrInfo::getLdStBaseOp(MI).getReg());
// If the stored value and the address of the second instruction is
// the same, it needs to be using the updated register and therefore
// it must not be folded.
- bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
- getLdStBaseOp(MI).getReg());
+ bool IsMIRegTheSame =
+ TRI->regsOverlap(getLdStRegOp(MI).getReg(),
+ AArch64InstrInfo::getLdStBaseOp(MI).getReg());
if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
IsMIRegTheSame) {
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
@@ -1722,8 +1710,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
if (*MaybeCanRename) {
Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
- FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses,
- TRI);
+ *FirstMI.getParent()->getParent(), Reg, DefinedInBB,
+ UsedInBetween, RequiredClasses, TRI);
if (MaybeRenameReg) {
Flags.setRenameReg(*MaybeRenameReg);
Flags.setMergeForward(true);
@@ -1760,6 +1748,28 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
return E;
}
+static MachineBasicBlock::iterator
+maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
+ auto End = MI.getParent()->end();
+ if (MaybeCFI == End ||
+ MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
+ !(MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy)) ||
+ AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
+ return End;
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpDefCfa:
+ case MCCFIInstruction::OpDefCfaOffset:
+ return MaybeCFI;
+ default:
+ return End;
+ }
+}
+
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update,
@@ -1769,6 +1779,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
"Unexpected base register update instruction to merge!");
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+
+ // If updating the SP and the following instruction is a CFA offset related
+ // CFI instruction, move it after the merged instruction.
+ MachineBasicBlock::iterator CFI =
+ IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
+
// Return the instruction following the merged instruction, which is
// the instruction following our unmerged load. Unless that's the add/sub
// instruction we're merging, in which case it's the one after that.
@@ -1786,12 +1802,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineInstrBuilder MIB;
int Scale, MinOffset, MaxOffset;
getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
- if (!isPairedLdSt(*I)) {
+ if (!AArch64InstrInfo::isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I))
- .add(getLdStBaseOp(*I))
+ .add(AArch64InstrInfo::getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands())
.setMIFlags(I->mergeFlagsWith(*Update));
@@ -1801,12 +1817,15 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I, 0))
.add(getLdStRegOp(*I, 1))
- .add(getLdStBaseOp(*I))
+ .add(AArch64InstrInfo::getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands())
.setMIFlags(I->mergeFlagsWith(*Update));
}
- (void)MIB;
+ if (CFI != E) {
+ MachineBasicBlock *MBB = I->getParent();
+ MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
+ }
if (IsPreIdx) {
++NumPreFolded;
@@ -1888,8 +1907,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
- Register BaseReg = getLdStBaseOp(MemMI).getReg();
- int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
+ TII->getMemScale(MemMI);
// Scan forward looking for post-index opportunities. Updating instructions
// can't be formed if the memory instruction doesn't have the offset we're
@@ -1904,7 +1924,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// behavior in this case unlike normal stores, and always performs writeback
// after reading the source register value.
if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
- bool IsPairedInsn = isPairedLdSt(MemMI);
+ bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
Register DestReg = getLdStRegOp(MemMI, i).getReg();
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -1965,8 +1985,8 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator MBBI = I;
MachineFunction &MF = *MemMI.getMF();
- Register BaseReg = getLdStBaseOp(MemMI).getReg();
- int Offset = getLdStOffsetOp(MemMI).getImm();
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
// not any matching update. Ditto if the memory offset isn't zero.
@@ -1975,7 +1995,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// If the base register overlaps a destination register, we can't
// merge the update.
if (!isTagStore(MemMI)) {
- bool IsPairedInsn = isPairedLdSt(MemMI);
+ bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
Register DestReg = getLdStRegOp(MemMI, i).getReg();
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2045,7 +2065,7 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
// Make sure this is a reg+imm.
// FIXME: It is possible to extend it to handle reg+reg cases.
- if (!getLdStOffsetOp(MI).isImm())
+ if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
return false;
// Look backward up to LdStLimit instructions.
@@ -2099,7 +2119,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
// range, plus allow an extra one in case we find a later insn that matches
// with Offset-1)
bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
- int Offset = getLdStOffsetOp(MI).getImm();
+ int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
// Allow one more for offset.
if (Offset > 0)
@@ -2166,7 +2186,8 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
// The immediate in the load/store is scaled by the size of the memory
// operation. The immediate in the add we're looking for,
// however, is not, so adjust here.
- int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
+ int UnscaledOffset =
+ AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
// Look forward to try to find a pre-index instruction. For example,
// ldr x1, [x0, #64]
@@ -2268,7 +2289,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
- Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 1fc5617b49f6..5c7fb0deecd0 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -60,12 +60,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
MachineLoopInfo *MLI;
MachineRegisterInfo *MRI;
+ using OpcodePair = std::pair<unsigned, unsigned>;
template <typename T>
using SplitAndOpcFunc =
- std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
+ std::function<Optional<OpcodePair>(T, unsigned, T &, T &)>;
using BuildMIFunc =
- std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
- Register, Register)>;
+ std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
+ Register, Register, Register)>;
/// For instructions where an immediate operand could be split into two
/// separate immediate instructions, use the splitTwoPartImm two handle the
@@ -83,20 +84,19 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
/// %dst = <Instr>ri %tmp (encode half IMM) [...]
template <typename T>
bool splitTwoPartImm(MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
MachineInstr *&SubregToRegMI);
template <typename T>
- bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
- bool visitORR(MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+
+ template <typename T>
+ bool visitAND(unsigned Opc, MachineInstr &MI);
+ bool visitORR(MachineInstr &MI);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -157,8 +157,7 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
- unsigned Opc, MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ unsigned Opc, MachineInstr &MI) {
// Try below transformation.
//
// MOVi32imm + ANDWrr ==> ANDWri + ANDWri
@@ -170,28 +169,27 @@ bool AArch64MIPeepholeOpt::visitAND(
// mov + and instructions.
return splitTwoPartImm<T>(
- MI, ToBeRemoved,
- [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
+ MI,
+ [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> {
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
- return Opc;
+ return std::make_pair(Opc, Opc);
return None;
},
- [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+ [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
unsigned Imm1, Register SrcReg, Register NewTmpReg,
Register NewDstReg) {
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock *MBB = MI.getParent();
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
.addReg(SrcReg)
.addImm(Imm0);
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
.addReg(NewTmpReg)
.addImm(Imm1);
});
}
-bool AArch64MIPeepholeOpt::visitORR(
- MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
// Check this ORR comes from below zero-extend pattern.
//
// def : Pat<(i64 (zext GPR32:$src)),
@@ -216,19 +214,38 @@ bool AArch64MIPeepholeOpt::visitORR(
// zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
// real AArch64 instruction and if it is not, do not process the opcode
// conservatively.
- if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+ if (SrcMI->getOpcode() == TargetOpcode::COPY &&
+ SrcMI->getOperand(1).getReg().isVirtual()) {
+ const TargetRegisterClass *RC =
+ MRI->getRegClass(SrcMI->getOperand(1).getReg());
+
+ // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
+ // that the upper bits are zero.
+ if (RC != &AArch64::FPR32RegClass &&
+ ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
+ SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
+ return false;
+ Register CpySrc = SrcMI->getOperand(1).getReg();
+ if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
+ CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
+ BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), CpySrc)
+ .add(SrcMI->getOperand(1));
+ }
+ BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
+ TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
+ .addReg(CpySrc);
+ SrcMI->eraseFromParent();
+ }
+ else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
return false;
Register DefReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(2).getReg();
MRI->replaceRegWith(DefReg, SrcReg);
MRI->clearKillFlags(SrcReg);
- // replaceRegWith changes MI's definition register. Keep it for SSA form until
- // deleting MI.
- MI.getOperand(0).setReg(DefReg);
- ToBeRemoved.insert(&MI);
-
LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
+ MI.eraseFromParent();
return true;
}
@@ -255,8 +272,7 @@ static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
- unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
- SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
// Try below transformation.
//
// MOVi32imm + ADDWrr ==> ADDWri + ADDWri
@@ -271,25 +287,65 @@ bool AArch64MIPeepholeOpt::visitADDSUB(
// multiple `mov` + `and/sub` instructions.
return splitTwoPartImm<T>(
- MI, ToBeRemoved,
+ MI,
[PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> Optional<unsigned> {
+ T &Imm1) -> Optional<OpcodePair> {
if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
- return PosOpc;
+ return std::make_pair(PosOpc, PosOpc);
if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
- return NegOpc;
+ return std::make_pair(NegOpc, NegOpc);
return None;
},
- [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+ [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
unsigned Imm1, Register SrcReg, Register NewTmpReg,
Register NewDstReg) {
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock *MBB = MI.getParent();
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
.addReg(SrcReg)
.addImm(Imm0)
.addImm(12);
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
+ .addReg(NewTmpReg)
+ .addImm(Imm1)
+ .addImm(0);
+ });
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSSUBS(
+ OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
+ // Try the same transformation as ADDSUB, but with the additional requirement
+ // that the condition code usages are only for Equal and Not Equal.
+ return splitTwoPartImm<T>(
+ MI,
+ [PosOpcs, NegOpcs, &MI, &TRI = TRI, &MRI = MRI](
+ T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> {
+ OpcodePair OP;
+ if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
+ OP = PosOpcs;
+ else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
+ OP = NegOpcs;
+ else
+ return None;
+ // Check conditional uses last, since doing so requires an expensive scan
+ // of subsequent instructions.
+ MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+ Optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
+ if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
+ return None;
+ return OP;
+ },
+ [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
+ unsigned Imm1, Register SrcReg, Register NewTmpReg,
+ Register NewDstReg) {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
+ .addReg(SrcReg)
+ .addImm(Imm0)
+ .addImm(12);
+ BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
.addReg(NewTmpReg)
.addImm(Imm1)
.addImm(0);
@@ -338,7 +394,7 @@ bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
- MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+ MachineInstr &MI,
SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
unsigned RegSize = sizeof(T) * 8;
assert((RegSize == 32 || RegSize == 64) &&
@@ -357,39 +413,63 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm(
// number since it was sign extended when we assign to the 64-bit Imm.
if (SubregToRegMI)
Imm &= 0xFFFFFFFF;
- unsigned Opcode;
+ OpcodePair Opcode;
if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
- Opcode = R.getValue();
+ Opcode = *R;
else
return false;
- // Create new ADD/SUB MIs.
+ // Create new MIs using the first and second opcodes. Opcodes might differ for
+ // flag setting operations that should only set flags on second instruction.
+ // NewTmpReg = Opcode.first SrcReg Imm0
+ // NewDstReg = Opcode.second NewTmpReg Imm1
+
+ // Determine register classes for destinations and register operands
MachineFunction *MF = MI.getMF();
- const TargetRegisterClass *RC =
- TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
- const TargetRegisterClass *ORC =
- TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
+ const TargetRegisterClass *FirstInstrDstRC =
+ TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
+ const TargetRegisterClass *FirstInstrOperandRC =
+ TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
+ const TargetRegisterClass *SecondInstrDstRC =
+ (Opcode.first == Opcode.second)
+ ? FirstInstrDstRC
+ : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
+ const TargetRegisterClass *SecondInstrOperandRC =
+ (Opcode.first == Opcode.second)
+ ? FirstInstrOperandRC
+ : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
+
+ // Get old registers destinations and new register destinations
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- Register NewTmpReg = MRI->createVirtualRegister(RC);
- Register NewDstReg = MRI->createVirtualRegister(RC);
-
- MRI->constrainRegClass(SrcReg, RC);
- MRI->constrainRegClass(NewTmpReg, ORC);
- MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
-
+ Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
+ // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
+ // reuse that same destination register.
+ Register NewDstReg = DstReg.isVirtual()
+ ? MRI->createVirtualRegister(SecondInstrDstRC)
+ : DstReg;
+
+ // Constrain registers based on their new uses
+ MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
+ MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
+ if (DstReg != NewDstReg)
+ MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
+
+ // Call the delegating operation to build the instruction
BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
- MRI->replaceRegWith(DstReg, NewDstReg);
// replaceRegWith changes MI's definition register. Keep it for SSA form until
- // deleting MI.
- MI.getOperand(0).setReg(DstReg);
+ // deleting MI. Only if we made a new destination register.
+ if (DstReg != NewDstReg) {
+ MRI->replaceRegWith(DstReg, NewDstReg);
+ MI.getOperand(0).setReg(DstReg);
+ }
// Record the MIs need to be removed.
- ToBeRemoved.insert(&MI);
+ MI.eraseFromParent();
if (SubregToRegMI)
- ToBeRemoved.insert(SubregToRegMI);
- ToBeRemoved.insert(MovMI);
+ SubregToRegMI->eraseFromParent();
+ MovMI->eraseFromParent();
return true;
}
@@ -407,45 +487,57 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
assert(MRI->isSSA() && "Expected to be run on SSA form!");
bool Changed = false;
- SmallSetVector<MachineInstr *, 8> ToBeRemoved;
for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::ANDWrr:
- Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
+ Changed = visitAND<uint32_t>(AArch64::ANDWri, MI);
break;
case AArch64::ANDXrr:
- Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
+ Changed = visitAND<uint64_t>(AArch64::ANDXri, MI);
break;
case AArch64::ORRWrs:
- Changed = visitORR(MI, ToBeRemoved);
+ Changed = visitORR(MI);
break;
case AArch64::ADDWrr:
- Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
- ToBeRemoved);
+ Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
break;
case AArch64::SUBWrr:
- Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
- ToBeRemoved);
+ Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
break;
case AArch64::ADDXrr:
- Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
- ToBeRemoved);
+ Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
break;
case AArch64::SUBXrr:
- Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
- ToBeRemoved);
+ Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
+ break;
+ case AArch64::ADDSWrr:
+ Changed = visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
+ {AArch64::SUBWri, AArch64::SUBSWri},
+ MI);
+ break;
+ case AArch64::SUBSWrr:
+ Changed = visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
+ {AArch64::ADDWri, AArch64::ADDSWri},
+ MI);
+ break;
+ case AArch64::ADDSXrr:
+ Changed = visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
+ {AArch64::SUBXri, AArch64::SUBSXri},
+ MI);
+ break;
+ case AArch64::SUBSXrr:
+ Changed = visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
+ {AArch64::ADDXri, AArch64::ADDSXri},
+ MI);
break;
}
}
}
- for (MachineInstr *MI : ToBeRemoved)
- MI->eraseFromParent();
-
return Changed;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 6950675c5d53..a2ab2b855d80 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -15,8 +15,11 @@
#include "AArch64MachineFunctionInfo.h"
#include "AArch64InstrInfo.h"
-#include <llvm/IR/Metadata.h>
-#include <llvm/IR/Module.h>
+#include "AArch64Subtarget.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
@@ -30,7 +33,7 @@ void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
void AArch64FunctionInfo::initializeBaseYamlFields(
const yaml::AArch64FunctionInfo &YamlMFI) {
- if (YamlMFI.HasRedZone.hasValue())
+ if (YamlMFI.HasRedZone)
HasRedZone = YamlMFI.HasRedZone;
}
@@ -77,15 +80,17 @@ static bool ShouldSignWithBKey(const Function &F) {
return Key.equals_insensitive("b_key");
}
-AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
+AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF_) : MF(&MF_) {
// If we already know that the function doesn't have a redzone, set
// HasRedZone here.
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+ if (MF->getFunction().hasFnAttribute(Attribute::NoRedZone))
HasRedZone = false;
- const Function &F = MF.getFunction();
+ const Function &F = MF->getFunction();
std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F);
SignWithBKey = ShouldSignWithBKey(F);
+ // TODO: skip functions that have no instrumented allocas for optimization
+ IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag);
if (!F.hasFnAttribute("branch-target-enforcement")) {
if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
@@ -101,6 +106,15 @@ AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
}
+MachineFunctionInfo *AArch64FunctionInfo::clone(
+ BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const {
+ AArch64FunctionInfo *InfoClone = DestMF.cloneInfo<AArch64FunctionInfo>(*this);
+ InfoClone->MF = &DestMF;
+ return InfoClone;
+}
+
bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
if (!SignReturnAddress)
return false;
@@ -111,6 +125,27 @@ bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
bool AArch64FunctionInfo::shouldSignReturnAddress() const {
return shouldSignReturnAddress(llvm::any_of(
- MF.getFrameInfo().getCalleeSavedInfo(),
+ MF->getFrameInfo().getCalleeSavedInfo(),
[](const auto &Info) { return Info.getReg() == AArch64::LR; }));
}
+
+bool AArch64FunctionInfo::needsDwarfUnwindInfo() const {
+ if (!NeedsDwarfUnwindInfo)
+ NeedsDwarfUnwindInfo = MF->needsFrameMoves() &&
+ !MF->getTarget().getMCAsmInfo()->usesWindowsCFI();
+
+ return *NeedsDwarfUnwindInfo;
+}
+
+bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo() const {
+ if (!NeedsAsyncDwarfUnwindInfo) {
+ const Function &F = MF->getFunction();
+ // The check got "minsize" is because epilogue unwind info is not emitted
+ // (yet) for homogeneous epilogues, outlined functions, and functions
+ // outlined from.
+ NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo() &&
+ F.getUWTableKind() == UWTableKind::Async &&
+ !F.hasMinSize();
+ }
+ return *NeedsAsyncDwarfUnwindInfo;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index e5e08e6c00d6..f070f989a5b7 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
@@ -36,7 +37,7 @@ class MachineInstr;
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
/// Backreference to the machine function.
- MachineFunction &MF;
+ MachineFunction *MF;
/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
@@ -115,7 +116,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// SRetReturnReg - sret lowering includes returning the value of the
/// returned struct in a register. This field holds the virtual register into
/// which the sret argument is passed.
- unsigned SRetReturnReg = 0;
+ Register SRetReturnReg;
+
/// SVE stack size (for predicates and data vectors) are maintained here
/// rather than in FrameInfo, as the placement and Stack IDs are target
/// specific.
@@ -173,9 +175,29 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// The stack slot where the Swift asynchronous context is stored.
int SwiftAsyncContextFrameIdx = std::numeric_limits<int>::max();
+ bool IsMTETagged = false;
+
+ /// The function has Scalable Vector or Scalable Predicate register argument
+ /// or return type
+ bool IsSVECC = false;
+
+ /// True if the function need unwind information.
+ mutable Optional<bool> NeedsDwarfUnwindInfo;
+
+ /// True if the function need asynchronous unwind information.
+ mutable Optional<bool> NeedsAsyncDwarfUnwindInfo;
+
public:
explicit AArch64FunctionInfo(MachineFunction &MF);
+ MachineFunctionInfo *
+ clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const override;
+
+ bool isSVECC() const { return IsSVECC; };
+ void setIsSVECC(bool s) { IsSVECC = s; };
+
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
@@ -395,6 +417,7 @@ public:
bool shouldSignReturnAddress(bool SpillsLR) const;
bool shouldSignWithBKey() const { return SignWithBKey; }
+ bool isMTETagged() const { return IsMTETagged; }
bool branchTargetEnforcement() const { return BranchTargetEnforcement; }
@@ -408,6 +431,9 @@ public:
}
int getSwiftAsyncContextFrameIdx() const { return SwiftAsyncContextFrameIdx; }
+ bool needsDwarfUnwindInfo() const;
+ bool needsAsyncDwarfUnwindInfo() const;
+
private:
// Hold the lists of LOHs.
MILOHContainer LOHContainerSet;
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
new file mode 100644
index 000000000000..6c8845ee8598
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -0,0 +1,82 @@
+//===- AArch64MachineScheduler.cpp - MI Scheduler for AArch64 -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+
+using namespace llvm;
+
+static bool needReorderStoreMI(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::STURQi:
+ case AArch64::STRQui:
+ if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+ return false;
+ LLVM_FALLTHROUGH;
+ case AArch64::STPQi:
+ return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm();
+ }
+
+ return false;
+}
+
+// Return true if two stores with same base address may overlap writes
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+ int64_t &Off0, int64_t &Off1) {
+ const MachineOperand &Base0 = AArch64InstrInfo::getLdStBaseOp(MI0);
+ const MachineOperand &Base1 = AArch64InstrInfo::getLdStBaseOp(MI1);
+
+ // May overlapping writes if two store instructions without same base
+ if (!Base0.isIdenticalTo(Base1))
+ return true;
+
+ int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+ int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+ Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+ ? AArch64InstrInfo::getLdStOffsetOp(MI0).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() * StoreSize0;
+ Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+ ? AArch64InstrInfo::getLdStOffsetOp(MI1).getImm()
+ : AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+ const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+ int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+ int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+ return llabs(Off0 - Off1) < StoreSize;
+}
+
+bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+ bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
+
+ if (Cand.isValid()) {
+ MachineInstr *Instr0 = TryCand.SU->getInstr();
+ MachineInstr *Instr1 = Cand.SU->getInstr();
+
+ if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+ return OriginalResult;
+
+ int64_t Off0, Off1;
+ // With the same base address and non-overlapping writes.
+ if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+ TryCand.Reason = NodeOrder;
+ // Order them by ascending offsets.
+ return Off0 < Off1;
+ }
+ }
+
+ return OriginalResult;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.h b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h
new file mode 100644
index 000000000000..23df015986d1
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h
@@ -0,0 +1,33 @@
+//===- AArch64MachineScheduler.h - Custom AArch64 MI scheduler --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom AArch64 MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// A MachineSchedStrategy implementation for AArch64 post RA scheduling.
+class AArch64PostRASchedStrategy : public PostGenericScheduler {
+public:
+ AArch64PostRASchedStrategy(const MachineSchedContext *C) :
+ PostGenericScheduler(C) {}
+
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index e8217eaf6ed5..c7657f37d16d 100644
--- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -157,16 +157,19 @@ static bool isCryptoEORPair(const MachineInstr *FirstMI,
return false;
}
-/// Literal generation.
-static bool isLiteralsPair(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
+static bool isAdrpAddPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
// Assume the 1st instr to be a wildcard if it is unspecified.
-
- // PC relative address.
if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::ADRP) &&
SecondMI.getOpcode() == AArch64::ADDXri)
return true;
+ return false;
+}
+/// Literal generation.
+static bool isLiteralsPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
// 32 bit immediate.
if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZWi) &&
(SecondMI.getOpcode() == AArch64::MOVKWi &&
@@ -397,6 +400,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
return true;
if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstMI, SecondMI))
return true;
+ if (ST.hasFuseAdrpAdd() && isAdrpAddPair(FirstMI, SecondMI))
+ return true;
if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
return true;
if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index f443cd03935c..4555f1a3ebb0 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -14,6577 +14,6608 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#include "llvm/ADT/ArrayRef.h"
+
// 31 entries have cost 0
-// 242 entries have cost 1
-// 1447 entries have cost 2
-// 3602 entries have cost 3
-// 1237 entries have cost 4
-// 2 entries have cost 5
+// 756 entries have cost 1
+// 3690 entries have cost 2
+// 2084 entries have cost 3
// This table is 6561*4 = 26244 bytes in size.
-static const unsigned PerfectShuffleTable[6561+1] = {
- 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
- 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
- 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
- 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
- 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
- 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
- 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
- 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
- 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
- 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
- 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
- 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
- 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
- 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
- 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
- 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
- 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
- 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
- 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
- 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
- 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
- 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
- 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
- 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
- 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
- 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
- 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
- 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
- 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
- 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
- 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
- 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
- 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
- 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
- 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
- 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
- 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
- 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
- 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
- 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
- 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
- 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
- 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
- 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
- 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
- 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
- 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
- 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
- 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
- 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
- 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
- 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
- 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
- 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
- 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
- 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
- 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
- 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
- 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
- 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
- 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
- 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
- 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
- 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
- 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
- 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
- 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
- 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
- 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
- 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
- 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
- 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
- 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
- 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
- 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
- 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
- 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
- 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
- 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
- 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
- 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
- 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
- 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
- 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
- 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
- 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
- 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
- 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
- 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
- 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
- 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
- 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
- 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
- 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
- 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
- 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
- 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
- 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
- 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
- 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
- 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
- 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
- 835584U, // <0,1,2,3>: Cost 0 copy LHS
- 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
- 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
- 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
- 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
- 835584U, // <0,1,2,u>: Cost 0 copy LHS
- 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
- 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
- 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
- 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
- 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
- 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
- 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
- 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
- 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
- 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
- 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
- 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
- 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
- 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
- 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
- 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
- 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
- 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
- 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
- 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
- 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
- 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
- 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
- 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
- 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
- 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
- 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
- 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
- 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
- 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
- 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
- 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
- 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
- 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
- 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
- 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
- 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
- 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
- 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
- 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
- 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
- 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
- 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
- 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
- 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
- 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
- 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
- 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
- 835584U, // <0,1,u,3>: Cost 0 copy LHS
- 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
- 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
- 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
- 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
- 835584U, // <0,1,u,u>: Cost 0 copy LHS
- 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
- 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
- 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
- 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
- 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
- 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
- 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
- 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
- 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
- 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
- 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
- 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
- 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
- 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
- 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
- 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
- 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
- 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
- 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
- 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
- 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
- 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
- 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
- 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
- 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
- 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
- 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
- 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
- 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
- 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
- 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
- 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
- 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
- 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
- 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
- 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
- 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
- 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
- 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
- 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
- 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
- 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
- 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
- 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
- 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
- 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
- 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
- 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
- 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
- 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
- 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
- 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
- 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
- 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
- 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
- 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
- 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
- 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
- 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
- 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
- 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
- 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
- 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
- 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
- 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
- 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
- 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
- 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
- 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
- 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
- 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
- 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
- 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
- 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
- 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
- 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
- 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
- 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
- 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
- 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
- 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
- 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
- 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
- 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
- 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
- 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
- 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
- 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
- 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
- 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
- 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
- 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
- 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
- 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
- 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
- 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
- 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
- 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
- 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
- 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
- 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
- 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
- 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
- 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
- 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
- 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
- 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
- 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
- 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
- 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
- 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
- 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
- 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
- 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
- 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
- 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
- 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
- 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
- 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
- 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
- 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
- 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
- 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
- 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
- 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
- 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
- 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
- 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
- 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
- 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
- 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
- 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
- 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
- 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
- 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
- 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
- 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
- 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
- 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
- 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
- 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
- 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
- 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
- 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
- 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
- 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
- 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
- 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
- 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
- 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
- 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
- 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
- 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
- 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
- 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
- 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
- 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
- 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
- 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
- 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
- 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
- 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
- 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
- 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
- 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
- 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
- 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
- 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
- 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
- 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
- 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
- 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
- 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
- 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
- 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
- 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
- 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
- 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
- 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
- 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
- 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
- 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
- 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
- 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
- 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
- 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
- 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
- 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
- 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
- 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
- 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
- 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
- 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
- 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
- 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
- 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
- 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
- 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
- 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
- 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
- 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
- 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
- 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
- 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
- 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
- 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
- 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
- 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
- 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
- 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
- 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
- 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
- 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
- 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
- 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
- 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
- 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
- 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
- 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
- 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
- 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
- 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
- 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
- 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
- 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
- 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
- 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
- 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
- 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
- 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
- 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
- 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
- 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
- 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
- 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
- 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
- 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
- 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
- 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
- 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
- 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
- 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
- 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
- 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
- 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
- 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
- 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
- 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
- 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
- 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
- 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
- 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
- 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
- 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
- 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
- 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
- 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
- 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
- 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
- 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
- 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
- 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
- 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
- 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
- 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
- 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
- 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
- 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
- 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
- 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
- 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
- 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
- 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
- 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
- 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
- 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
- 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
- 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
- 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
- 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
- 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
- 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
- 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
- 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
- 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
- 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
- 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
- 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
- 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
- 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
- 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
- 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
- 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
- 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
- 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
- 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
- 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
- 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
- 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
- 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
- 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
- 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
- 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
- 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
- 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
- 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
- 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
- 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
- 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
- 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
- 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
- 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
- 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
- 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
- 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
- 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
- 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
- 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
- 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
- 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
- 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
- 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
- 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
- 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
- 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
- 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
- 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
- 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
- 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
- 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
- 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
- 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
- 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
- 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
- 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
- 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
- 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
- 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
- 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
- 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
- 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
- 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
- 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
- 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
- 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
- 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
- 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
- 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
- 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
- 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
- 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
- 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
- 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
- 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
- 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
- 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
- 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
- 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
- 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
- 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
- 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
- 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
- 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
- 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
- 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
- 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
- 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
- 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
- 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
- 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
- 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
- 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
- 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
- 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
- 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
- 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
- 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
- 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
- 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
- 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
- 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
- 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
- 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
- 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
- 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
- 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
- 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
- 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
- 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
- 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
- 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
- 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
- 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
- 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
- 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
- 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
- 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
- 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
- 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
- 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
- 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
- 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
- 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
- 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
- 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
- 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
- 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
- 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
- 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
- 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
- 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
- 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
- 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
- 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
- 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
- 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
- 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
- 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
- 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
- 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
- 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
- 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
- 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
- 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
- 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
- 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
- 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
- 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
- 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
- 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
- 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
- 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
- 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
- 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
- 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
- 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
- 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
- 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
- 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
- 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
- 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
- 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
- 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
- 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
- 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
- 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
- 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
- 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
- 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
- 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
- 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
- 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
- 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
- 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
- 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
- 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
- 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
- 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
- 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
- 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
- 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
- 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
- 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
- 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
- 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
- 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
- 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
- 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
- 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
- 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
- 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
- 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
- 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
- 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
- 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
- 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
- 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
- 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
- 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
- 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
- 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
- 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
- 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
- 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
- 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
- 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
- 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
- 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
- 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
- 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
- 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
- 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
- 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
- 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
- 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
- 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
- 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
- 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
- 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
- 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
- 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
- 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
- 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
- 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
- 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
- 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
- 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
- 835584U, // <0,u,2,3>: Cost 0 copy LHS
- 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
- 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
- 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
- 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
- 835584U, // <0,u,2,u>: Cost 0 copy LHS
- 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
- 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
- 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
- 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
- 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
- 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
- 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
- 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
- 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
- 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
- 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
- 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
- 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
- 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
- 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
- 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
- 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
- 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
- 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
- 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
- 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
- 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
- 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
- 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
- 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
- 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
- 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
- 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
- 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
- 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
- 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
- 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
- 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
- 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
- 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
- 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
- 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
- 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
- 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
- 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
- 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
- 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
- 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
- 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
- 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
- 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
- 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
- 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
- 835584U, // <0,u,u,3>: Cost 0 copy LHS
- 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
- 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
- 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
- 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
- 835584U, // <0,u,u,u>: Cost 0 copy LHS
- 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
- 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
- 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
- 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
- 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
- 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
- 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
- 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
- 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
- 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
- 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
- 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
- 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
- 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
- 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
- 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
- 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
- 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
- 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
- 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
- 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
- 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
- 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
- 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
- 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
- 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
- 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
- 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
- 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
- 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
- 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
- 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
- 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
- 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
- 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
- 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
- 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
- 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
- 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
- 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
- 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
- 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
- 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
- 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
- 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
- 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
- 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
- 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
- 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
- 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
- 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
- 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
- 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
- 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
- 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
- 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
- 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
- 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
- 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
- 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
- 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
- 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
- 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
- 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
- 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
- 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
- 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
- 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
- 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
- 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
- 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
- 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
- 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
- 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
- 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
- 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
- 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
- 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
- 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
- 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
- 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
- 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
- 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
- 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
- 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
- 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
- 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
- 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
- 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
- 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
- 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
- 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
- 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
- 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
- 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
- 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
- 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
- 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
- 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
- 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
- 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
- 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
- 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
- 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
- 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
- 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
- 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
- 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
- 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
- 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
- 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
- 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
- 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
- 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
- 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
- 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
- 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
- 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
- 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
- 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
- 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
- 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
- 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
- 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
- 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
- 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
- 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
- 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
- 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
- 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
- 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
- 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
- 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
- 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
- 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
- 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
- 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
- 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
- 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
- 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
- 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
- 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
- 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
- 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
- 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
- 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
- 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
- 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
- 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
- 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
- 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
- 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
- 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
- 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
- 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
- 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
- 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
- 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
- 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
- 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
- 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
- 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
- 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
- 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
- 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
- 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
- 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
- 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
- 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
- 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
- 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
- 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
- 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
- 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
- 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
- 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
- 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
- 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
- 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
- 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
- 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
- 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
- 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
- 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
- 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
- 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
- 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
- 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
- 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
- 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
- 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
- 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
- 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
- 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
- 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
- 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
- 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
- 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
- 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
- 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
- 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
- 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
- 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
- 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
- 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
- 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
- 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
- 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
- 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
- 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
- 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
- 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
- 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
- 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
- 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
- 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
- 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
- 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
- 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
- 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
- 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
- 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
- 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
- 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
- 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
- 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
- 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
- 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
- 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
- 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
- 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
- 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
- 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
- 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
- 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
- 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
- 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
- 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
- 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
- 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
- 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
- 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
- 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
- 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
- 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
- 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
- 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
- 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
- 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
- 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
- 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
- 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
- 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
- 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
- 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
- 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
- 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
- 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
- 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
- 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
- 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
- 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
- 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
- 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
- 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
- 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
- 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
- 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
- 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
- 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
- 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
- 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
- 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
- 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
- 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
- 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
- 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
- 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
- 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
- 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
- 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
- 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
- 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
- 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
- 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
- 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
- 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
- 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
- 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
- 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
- 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
- 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
- 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
- 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
- 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
- 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
- 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
- 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
- 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
- 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
- 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
- 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
- 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
- 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
- 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
- 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
- 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
- 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
- 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
- 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
- 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
- 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
- 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
- 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
- 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
- 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
- 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
- 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
- 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
- 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
- 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
- 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
- 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
- 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
- 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
- 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
- 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
- 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
- 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
- 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
- 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
- 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
- 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
- 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
- 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
- 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
- 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
- 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
- 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
- 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
- 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
- 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
- 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
- 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
- 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
- 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
- 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
- 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
- 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
- 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
- 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
- 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
- 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
- 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
- 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
- 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
- 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
- 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
- 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
- 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
- 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
- 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
- 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
- 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
- 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
- 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
- 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
- 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
- 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
- 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
- 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
- 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
- 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
- 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
- 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
- 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
- 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
- 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
- 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
- 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
- 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
- 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
- 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
- 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
- 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
- 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
- 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
- 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
- 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
- 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
- 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
- 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
- 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
- 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
- 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
- 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
- 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
- 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
- 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
- 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
- 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
- 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
- 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
- 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
- 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
- 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
- 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
- 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
- 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
- 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
- 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
- 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
- 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
- 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
- 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
- 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
- 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
- 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
- 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
- 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
- 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
- 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
- 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
- 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
- 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
- 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
- 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
- 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
- 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
- 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
- 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
- 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
- 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
- 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
- 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
- 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
- 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
- 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
- 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
- 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
- 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
- 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
- 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
- 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
- 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
- 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
- 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
- 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
- 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
- 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
- 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
- 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
- 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
- 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
- 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
- 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
- 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
- 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
- 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
- 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
- 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
- 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
- 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
- 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
- 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
- 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
- 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
- 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
- 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
- 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
- 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
- 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
- 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
- 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
- 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
- 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
- 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
- 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
- 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
- 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
- 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
- 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
- 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
- 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
- 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
- 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
- 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
- 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
- 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
- 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
- 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
- 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
- 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
- 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
- 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
- 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
- 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
- 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
- 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
- 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
- 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
- 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
- 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
- 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
- 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
- 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
- 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
- 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
- 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
- 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
- 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
- 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
- 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
- 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
- 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
- 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
- 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
- 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
- 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
- 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
- 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
- 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
- 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
- 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
- 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
- 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
- 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
- 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
- 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
- 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
- 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
- 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
- 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
- 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
- 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
- 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
- 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
- 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
- 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
- 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
- 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
- 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
- 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
- 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
- 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
- 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
- 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
- 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
- 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
- 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
- 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
- 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
- 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
- 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
- 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
- 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
- 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
- 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
- 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
- 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
- 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
- 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
- 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
- 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
- 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
- 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
- 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
- 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
- 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
- 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
- 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
- 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
- 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
- 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
- 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
- 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
- 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
- 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
- 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
- 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
- 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
- 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
- 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
- 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
- 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
- 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
- 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
- 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
- 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
- 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
- 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
- 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
- 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
- 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
- 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
- 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
- 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
- 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
- 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
- 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
- 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
- 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
- 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
- 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
- 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
- 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
- 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
- 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
- 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
- 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
- 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
- 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
- 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
- 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
- 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
- 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
- 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
- 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
- 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
- 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
- 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
- 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
- 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
- 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
- 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
- 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
- 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
- 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
- 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
- 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
- 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
- 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
- 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
- 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
- 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
- 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
- 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
- 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
- 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
- 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
- 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
- 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
- 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
- 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
- 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
- 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
- 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
- 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
- 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
- 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
- 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
- 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
- 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
- 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
- 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
- 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
- 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
- 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
- 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
- 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
- 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
- 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
- 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
- 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
- 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
- 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
- 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
- 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
- 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
- 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
- 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
- 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
- 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
- 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
- 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
- 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
- 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
- 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
- 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
- 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
- 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
- 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
- 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
- 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
- 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
- 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
- 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
- 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
- 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
- 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
- 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
- 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
- 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
- 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
- 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
- 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
- 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
- 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
- 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
- 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
- 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
- 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
- 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
- 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
- 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
- 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
- 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
- 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
- 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
- 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
- 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
- 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
- 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
- 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
- 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
- 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
- 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
- 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
- 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
- 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
- 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
- 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
- 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
- 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
- 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
- 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
- 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
- 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
- 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
- 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
- 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
- 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
- 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
- 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
- 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
- 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
- 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
- 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
- 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
- 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
- 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
- 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
- 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
- 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
- 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
- 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
- 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
- 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
- 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
- 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
- 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
- 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
- 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
- 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
- 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
- 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
- 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
- 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
- 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
- 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
- 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
- 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
- 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
- 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
- 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
- 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
- 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
- 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
- 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
- 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
- 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
- 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
- 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
- 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
- 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
- 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
- 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
- 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
- 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
- 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
- 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
- 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
- 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
- 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
- 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
- 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
- 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
- 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
- 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
- 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
- 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
- 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
- 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
- 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
- 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
- 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
- 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
- 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
- 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
- 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
- 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
- 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
- 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
- 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
- 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
- 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
- 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
- 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
- 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
- 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
- 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
- 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
- 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
- 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
- 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
- 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
- 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
- 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
- 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
- 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
- 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
- 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
- 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
- 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
- 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
- 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
- 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
- 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
- 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
- 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
- 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
- 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
- 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
- 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
- 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
- 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
- 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
- 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
- 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
- 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
- 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
- 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
- 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
- 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
- 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
- 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
- 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
- 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
- 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
- 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
- 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
- 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
- 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
- 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
- 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
- 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
- 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
- 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
- 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
- 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
- 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
- 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
- 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
- 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
- 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
- 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
- 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
- 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
- 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
- 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
- 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
- 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
- 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
- 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
- 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
- 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
- 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
- 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
- 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
- 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
- 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
- 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
- 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
- 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
- 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
- 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
- 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
- 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
- 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
- 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
- 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
- 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
- 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
- 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
- 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
- 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
- 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
- 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
- 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
- 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
- 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
- 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
- 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
- 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
- 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
- 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
- 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
- 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
- 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
- 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
- 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
- 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
- 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
- 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
- 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
- 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
- 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
- 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
- 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
- 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
- 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
- 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
- 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
- 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
- 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
- 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
- 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
- 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
- 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
- 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
- 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
- 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
- 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
- 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
- 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
- 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
- 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
- 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
- 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
- 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
- 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
- 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
- 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
- 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
- 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
- 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
- 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
- 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
- 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
- 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
- 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
- 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
- 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
- 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
- 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
- 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
- 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
- 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
- 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
- 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
- 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
- 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
- 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
- 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
- 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
- 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
- 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
- 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
- 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
- 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
- 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
- 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
- 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
- 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
- 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
- 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
- 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
- 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
- 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
- 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
- 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
- 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
- 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
- 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
- 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
- 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
- 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
- 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
- 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
- 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
- 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
- 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
- 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
- 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
- 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
- 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
- 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
- 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
- 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
- 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
- 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
- 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
- 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
- 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
- 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
- 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
- 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
- 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
- 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
- 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
- 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
- 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
- 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
- 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
- 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
- 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
- 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
- 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
- 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
- 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
- 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
- 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
- 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
- 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
- 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
- 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
- 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
- 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
- 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
- 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
- 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
- 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
- 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
- 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
- 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
- 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
- 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
- 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
- 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
- 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
- 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
- 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
- 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
- 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
- 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
- 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
- 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
- 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
- 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
- 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
- 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
- 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
- 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
- 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
- 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
- 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
- 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
- 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
- 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
- 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
- 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
- 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
- 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
- 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
- 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
- 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
- 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
- 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
- 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
- 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
- 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
- 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
- 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
- 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
- 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
- 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
- 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
- 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
- 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
- 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
- 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
- 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
- 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
- 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
- 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
- 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
- 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
- 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
- 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
- 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
- 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
- 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
- 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
- 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
- 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
- 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
- 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
- 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
- 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
- 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
- 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
- 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
- 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
- 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
- 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
- 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
- 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
- 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
- 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
- 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
- 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
- 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
- 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
- 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
- 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
- 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
- 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
- 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
- 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
- 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
- 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
- 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
- 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
- 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
- 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
- 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
- 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
- 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
- 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
- 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
- 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
- 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
- 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
- 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
- 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
- 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
- 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
- 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
- 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
- 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
- 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
- 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
- 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
- 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
- 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
- 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
- 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
- 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
- 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
- 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
- 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
- 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
- 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
- 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
- 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
- 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
- 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
- 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
- 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
- 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
- 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
- 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
- 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
- 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
- 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
- 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
- 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
- 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
- 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
- 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
- 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
- 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
- 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
- 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
- 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
- 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
- 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
- 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
- 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
- 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
- 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
- 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
- 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
- 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
- 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
- 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
- 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
- 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
- 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
- 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
- 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
- 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
- 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
- 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
- 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
- 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
- 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
- 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
- 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
- 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
- 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
- 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
- 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
- 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
- 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
- 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
- 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
- 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
- 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
- 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
- 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
- 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
- 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
- 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
- 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
- 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
- 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
- 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
- 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
- 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
- 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
- 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
- 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
- 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
- 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
- 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
- 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
- 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
- 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
- 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
- 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
- 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
- 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
- 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
- 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
- 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
- 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
- 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
- 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
- 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
- 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
- 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
- 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
- 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
- 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
- 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
- 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
- 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
- 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
- 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
- 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
- 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
- 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
- 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
- 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
- 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
- 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
- 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
- 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
- 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
- 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
- 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
- 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
- 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
- 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
- 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
- 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
- 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
- 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
- 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
- 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
- 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
- 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
- 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
- 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
- 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
- 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
- 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
- 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
- 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
- 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
- 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
- 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
- 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
- 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
- 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
- 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
- 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
- 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
- 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
- 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
- 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
- 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
- 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
- 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
- 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
- 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
- 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
- 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
- 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
- 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
- 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
- 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
- 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
- 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
- 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
- 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
- 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
- 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
- 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
- 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
- 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
- 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
- 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
- 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
- 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
- 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
- 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
- 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
- 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
- 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
- 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
- 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
- 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
- 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
- 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
- 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
- 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
- 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
- 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
- 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
- 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
- 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
- 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
- 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
- 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
- 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
- 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
- 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
- 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
- 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
- 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
- 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
- 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
- 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
- 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
- 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
- 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
- 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
- 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
- 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
- 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
- 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
- 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
- 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
- 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
- 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
- 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
- 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
- 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
- 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
- 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
- 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
- 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
- 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
- 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
- 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
- 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
- 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
- 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
- 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
- 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
- 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
- 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
- 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
- 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
- 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
- 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
- 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
- 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
- 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
- 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
- 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
- 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
- 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
- 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
- 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
- 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
- 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
- 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
- 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
- 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
- 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
- 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
- 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
- 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
- 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
- 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
- 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
- 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
- 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
- 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
- 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
- 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
- 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
- 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
- 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
- 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
- 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
- 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
- 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
- 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
- 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
- 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
- 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
- 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
- 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
- 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
- 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
- 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
- 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
- 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
- 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
- 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
- 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
- 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
- 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
- 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
- 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
- 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
- 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
- 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
- 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
- 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
- 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
- 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
- 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
- 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
- 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
- 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
- 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
- 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
- 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
- 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
- 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
- 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
- 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
- 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
- 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
- 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
- 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
- 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
- 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
- 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
- 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
- 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
- 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
- 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
- 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
- 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
- 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
- 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
- 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
- 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
- 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
- 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
- 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
- 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
- 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
- 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
- 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
- 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
- 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
- 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
- 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
- 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
- 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
- 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
- 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
- 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
- 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
- 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
- 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
- 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
- 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
- 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
- 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
- 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
- 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
- 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
- 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
- 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
- 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
- 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
- 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
- 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
- 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
- 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
- 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
- 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
- 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
- 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
- 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
- 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
- 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
- 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
- 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
- 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
- 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
- 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
- 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
- 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
- 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
- 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
- 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
- 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
- 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
- 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
- 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
- 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
- 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
- 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
- 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
- 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
- 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
- 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
- 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
- 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
- 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
- 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
- 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
- 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
- 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
- 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
- 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
- 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
- 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
- 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
- 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
- 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
- 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
- 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
- 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
- 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
- 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
- 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
- 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
- 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
- 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
- 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
- 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
- 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
- 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
- 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
- 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
- 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
- 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
- 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
- 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
- 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
- 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
- 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
- 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
- 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
- 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
- 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
- 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
- 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
- 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
- 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
- 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
- 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
- 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
- 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
- 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
- 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
- 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
- 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
- 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
- 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
- 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
- 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
- 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
- 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
- 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
- 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
- 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
- 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
- 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
- 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
- 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
- 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
- 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
- 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
- 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
- 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
- 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
- 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
- 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
- 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
- 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
- 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
- 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
- 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
- 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
- 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
- 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
- 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
- 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
- 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
- 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
- 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
- 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
- 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
- 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
- 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
- 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
- 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
- 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
- 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
- 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
- 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
- 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
- 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
- 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
- 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
- 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
- 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
- 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
- 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
- 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
- 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
- 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
- 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
- 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
- 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
- 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
- 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
- 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
- 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
- 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
- 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
- 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
- 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
- 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
- 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
- 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
- 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
- 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
- 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
- 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
- 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
- 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
- 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
- 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
- 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
- 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
- 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
- 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
- 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
- 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
- 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
- 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
- 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
- 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
- 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
- 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
- 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
- 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
- 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
- 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
- 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
- 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
- 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
- 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
- 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
- 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
- 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
- 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
- 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
- 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
- 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
- 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
- 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
- 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
- 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
- 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
- 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
- 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
- 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
- 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
- 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
- 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
- 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
- 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
- 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
- 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
- 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
- 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
- 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
- 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
- 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
- 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
- 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
- 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
- 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
- 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
- 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
- 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
- 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
- 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
- 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
- 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
- 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
- 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
- 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
- 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
- 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
- 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
- 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
- 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
- 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
- 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
- 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
- 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
- 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
- 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
- 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
- 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
- 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
- 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
- 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
- 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
- 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
- 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
- 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
- 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
- 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
- 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
- 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
- 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
- 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
- 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
- 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
- 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
- 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
- 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
- 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
- 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
- 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
- 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
- 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
- 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
- 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
- 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
- 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
- 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
- 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
- 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
- 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
- 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
- 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
- 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
- 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
- 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
- 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
- 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
- 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
- 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
- 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
- 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
- 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
- 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
- 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
- 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
- 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
- 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
- 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
- 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
- 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
- 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
- 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
- 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
- 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
- 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
- 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
- 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
- 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
- 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
- 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
- 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
- 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
- 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
- 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
- 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
- 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
- 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
- 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
- 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
- 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
- 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
- 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
- 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
- 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
- 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
- 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
- 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
- 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
- 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
- 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
- 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
- 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
- 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
- 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
- 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
- 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
- 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
- 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
- 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
- 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
- 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
- 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
- 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
- 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
- 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
- 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
- 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
- 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
- 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
- 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
- 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
- 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
- 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
- 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
- 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
- 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
- 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
- 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
- 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
- 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
- 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
- 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
- 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
- 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
- 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
- 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
- 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
- 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
- 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
- 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
- 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
- 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
- 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
- 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
- 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
- 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
- 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
- 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
- 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
- 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
- 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
- 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
- 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
- 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
- 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
- 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
- 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
- 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
- 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
- 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
- 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
- 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
- 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
- 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
- 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
- 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
- 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
- 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
- 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
- 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
- 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
- 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
- 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
- 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
- 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
- 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
- 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
- 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
- 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
- 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
- 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
- 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
- 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
- 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
- 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
- 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
- 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
- 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
- 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
- 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
- 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
- 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
- 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
- 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
- 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
- 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
- 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
- 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
- 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
- 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
- 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
- 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
- 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
- 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
- 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
- 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
- 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
- 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
- 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
- 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
- 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
- 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
- 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
- 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
- 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
- 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
- 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
- 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
- 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
- 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
- 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
- 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
- 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
- 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
- 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
- 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
- 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
- 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
- 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
- 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
- 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
- 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
- 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
- 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
- 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
- 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
- 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
- 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
- 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
- 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
- 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
- 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
- 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
- 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
- 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
- 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
- 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
- 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
- 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
- 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
- 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
- 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
- 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
- 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
- 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
- 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
- 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
- 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
- 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
- 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
- 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
- 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
- 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
- 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
- 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
- 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
- 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
- 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
- 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
- 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
- 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
- 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
- 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
- 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
- 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
- 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
- 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
- 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
- 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
- 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
- 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
- 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
- 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
- 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
- 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
- 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
- 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
- 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
- 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
- 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
- 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
- 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
- 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
- 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
- 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
- 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
- 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
- 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
- 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
- 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
- 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
- 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
- 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
- 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
- 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
- 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
- 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
- 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
- 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
- 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
- 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
- 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
- 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
- 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
- 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
- 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
- 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
- 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
- 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
- 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
- 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
- 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
- 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
- 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
- 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
- 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
- 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
- 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
- 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
- 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
- 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
- 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
- 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
- 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
- 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
- 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
- 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
- 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
- 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
- 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
- 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
- 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
- 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
- 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
- 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
- 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
- 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
- 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
- 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
- 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
- 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
- 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
- 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
- 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
- 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
- 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
- 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
- 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
- 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
- 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
- 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
- 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
- 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
- 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
- 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
- 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
- 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
- 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
- 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
- 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
- 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
- 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
- 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
- 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
- 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
- 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
- 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
- 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
- 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
- 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
- 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
- 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
- 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
- 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
- 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
- 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
- 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
- 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
- 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
- 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
- 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
- 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
- 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
- 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
- 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
- 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
- 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
- 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
- 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
- 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
- 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
- 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
- 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
- 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
- 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
- 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
- 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
- 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
- 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
- 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
- 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
- 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
- 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
- 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
- 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
- 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
- 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
- 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
- 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
- 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
- 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
- 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
- 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
- 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
- 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
- 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
- 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
- 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
- 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
- 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
- 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
- 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
- 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
- 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
- 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
- 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
- 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
- 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
- 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
- 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
- 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
- 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
- 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
- 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
- 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
- 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
- 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
- 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
- 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
- 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
- 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
- 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
- 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
- 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
- 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
- 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
- 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
- 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
- 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
- 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
- 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
- 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
- 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
- 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
- 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
- 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
- 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
- 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
- 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
- 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
- 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
- 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
- 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
- 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
- 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
- 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
- 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
- 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
- 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
- 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
- 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
- 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
- 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
- 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
- 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
- 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
- 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
- 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
- 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
- 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
- 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
- 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
- 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
- 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
- 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
- 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
- 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
- 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
- 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
- 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
- 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
- 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
- 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
- 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
- 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
- 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
- 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
- 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
- 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
- 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
- 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
- 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
- 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
- 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
- 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
- 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
- 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
- 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
- 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
- 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
- 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
- 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
- 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
- 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
- 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
- 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
- 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
- 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
- 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
- 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
- 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
- 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
- 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
- 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
- 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
- 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
- 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
- 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
- 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
- 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
- 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
- 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
- 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
- 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
- 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
- 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
- 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
- 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
- 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
- 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
- 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
- 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
- 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
- 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
- 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
- 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
- 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
- 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
- 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
- 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
- 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
- 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
- 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
- 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
- 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
- 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
- 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
- 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
- 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
- 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
- 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
- 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
- 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
- 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
- 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
- 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
- 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
- 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
- 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
- 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
- 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
- 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
- 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
- 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
- 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
- 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
- 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
- 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
- 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
- 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
- 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
- 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
- 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
- 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
- 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
- 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
- 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
- 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
- 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
- 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
- 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
- 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
- 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
- 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
- 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
- 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
- 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
- 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
- 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
- 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
- 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
- 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
- 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
- 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
- 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
- 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
- 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
- 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
- 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
- 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
- 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
- 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
- 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
- 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
- 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
- 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
- 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
- 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
- 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
- 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
- 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
- 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
- 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
- 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
- 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
- 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
- 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
- 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
- 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
- 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
- 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
- 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
- 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
- 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
- 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
- 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
- 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
- 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
- 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
- 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
- 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
- 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
- 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
- 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
- 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
- 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
- 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
- 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
- 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
- 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
- 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
- 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
- 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
- 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
- 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
- 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
- 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
- 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
- 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
- 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
- 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
- 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
- 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
- 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
- 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
- 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
- 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
- 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
- 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
- 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
- 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
- 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
- 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
- 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
- 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
- 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
- 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
- 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
- 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
- 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
- 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
- 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
- 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
- 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
- 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
- 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
- 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
- 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
- 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
- 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
- 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
- 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
- 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
- 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
- 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
- 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
- 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
- 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
- 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
- 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
- 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
- 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
- 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
- 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
- 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
- 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
- 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
- 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
- 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
- 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
- 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
- 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
- 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
- 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
- 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
- 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
- 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
- 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
- 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
- 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
- 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
- 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
- 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
- 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
- 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
- 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
- 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
- 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
- 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
- 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
- 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
- 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
- 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
- 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
- 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
- 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
- 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
- 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
- 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
- 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
- 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
- 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
- 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
- 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
- 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
- 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
- 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
- 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
- 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
- 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
- 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
- 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
- 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
- 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
- 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
- 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
- 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
- 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
- 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
- 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
- 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
- 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
- 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
- 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
- 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
- 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
- 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
- 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
- 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
- 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
- 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
- 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
- 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
- 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
- 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
- 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
- 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
- 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
- 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
- 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
- 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
- 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
- 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
- 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
- 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
- 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
- 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
- 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
- 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
- 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
- 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
- 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
- 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
- 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
- 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
- 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
- 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
- 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
- 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
- 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
- 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
- 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
- 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
- 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
- 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
- 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
- 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
- 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
- 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
- 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
- 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
- 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
- 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
- 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
- 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
- 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
- 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
- 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
- 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
- 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
- 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
- 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
- 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
- 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
- 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
- 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
- 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
- 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
- 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
- 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
- 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
- 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
- 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
- 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
- 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
- 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
- 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
- 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
- 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
- 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
- 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
- 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
- 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
- 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
- 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
- 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
- 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
- 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
- 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
- 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
- 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
- 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
- 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
- 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
- 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
- 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
- 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
- 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
- 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
- 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
- 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
- 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
- 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
- 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
- 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
- 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
- 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
- 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
- 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
- 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
- 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
- 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
- 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
- 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
- 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
- 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
- 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
- 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
- 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
- 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
- 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
- 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
- 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
- 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
- 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
- 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
- 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
- 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
- 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
- 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
- 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
- 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
- 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
- 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
- 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
- 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
- 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
- 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
- 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
- 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
- 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
- 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
- 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
- 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
- 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
- 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
- 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
- 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
- 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
- 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
- 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
- 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
- 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
- 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
- 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
- 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
- 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
- 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
- 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
- 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
- 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
- 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
- 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
- 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
- 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
- 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
- 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
- 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
- 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
- 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
- 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
- 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
- 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
- 27705344U, // <4,5,6,7>: Cost 0 copy RHS
- 27705344U, // <4,5,6,u>: Cost 0 copy RHS
- 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
- 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
- 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
- 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
- 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
- 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
- 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
- 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
- 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
- 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
- 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
- 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
- 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
- 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
- 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
- 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
- 27705344U, // <4,5,u,7>: Cost 0 copy RHS
- 27705344U, // <4,5,u,u>: Cost 0 copy RHS
- 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
- 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
- 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
- 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
- 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
- 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
- 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
- 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
- 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
- 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
- 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
- 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
- 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
- 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
- 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
- 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
- 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
- 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
- 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
- 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
- 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
- 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
- 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
- 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
- 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
- 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
- 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
- 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
- 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
- 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
- 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
- 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
- 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
- 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
- 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
- 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
- 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
- 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
- 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
- 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
- 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
- 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
- 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
- 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
- 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
- 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
- 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
- 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
- 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
- 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
- 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
- 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
- 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
- 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
- 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
- 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
- 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
- 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
- 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
- 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
- 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
- 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
- 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
- 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
- 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
- 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
- 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
- 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
- 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
- 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
- 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
- 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
- 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
- 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
- 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
- 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
- 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
- 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
- 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
- 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
- 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
- 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
- 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
- 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
- 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
- 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
- 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
- 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
- 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
- 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
- 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
- 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
- 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
- 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
- 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
- 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
- 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
- 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
- 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
- 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
- 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
- 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
- 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
- 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
- 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
- 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
- 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
- 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
- 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
- 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
- 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
- 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
- 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
- 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
- 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
- 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
- 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
- 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
- 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
- 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
- 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
- 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
- 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
- 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
- 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
- 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
- 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
- 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
- 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
- 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
- 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
- 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
- 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
- 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
- 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
- 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
- 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
- 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
- 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
- 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
- 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
- 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
- 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
- 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
- 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
- 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
- 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
- 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
- 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
- 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
- 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
- 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
- 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
- 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
- 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
- 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
- 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
- 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
- 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
- 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
- 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
- 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
- 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
- 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
- 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
- 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
- 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
- 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
- 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
- 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
- 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
- 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
- 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
- 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
- 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
- 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
- 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
- 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
- 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
- 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
- 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
- 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
- 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
- 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
- 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
- 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
- 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
- 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
- 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
- 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
- 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
- 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
- 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
- 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
- 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
- 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
- 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
- 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
- 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
- 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
- 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
- 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
- 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
- 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
- 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
- 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
- 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
- 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
- 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
- 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
- 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
- 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
- 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
- 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
- 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
- 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
- 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
- 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
- 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
- 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
- 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
- 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
- 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
- 27705344U, // <4,u,6,7>: Cost 0 copy RHS
- 27705344U, // <4,u,6,u>: Cost 0 copy RHS
- 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
- 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
- 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
- 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
- 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
- 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
- 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
- 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
- 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
- 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
- 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
- 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
- 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
- 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
- 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
- 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
- 27705344U, // <4,u,u,7>: Cost 0 copy RHS
- 27705344U, // <4,u,u,u>: Cost 0 copy RHS
- 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
- 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
- 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
- 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
- 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
- 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
- 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
- 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
- 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
- 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
- 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
- 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
- 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
- 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
- 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
- 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
- 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
- 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
- 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
- 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
- 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
- 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
- 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
- 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
- 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
- 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
- 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
- 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
- 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
- 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
- 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
- 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
- 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
- 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
- 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
- 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
- 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
- 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
- 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
- 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
- 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
- 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
- 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
- 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
- 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
- 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
- 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
- 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
- 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
- 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
- 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
- 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
- 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
- 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
- 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
- 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
- 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
- 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
- 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
- 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
- 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
- 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
- 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
- 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
- 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
- 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
- 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
- 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
- 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
- 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
- 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
- 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
- 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
- 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
- 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
- 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
- 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
- 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
- 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
- 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
- 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
- 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
- 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
- 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
- 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
- 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
- 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
- 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
- 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
- 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
- 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
- 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
- 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
- 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
- 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
- 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
- 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
- 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
- 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
- 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
- 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
- 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
- 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
- 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
- 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
- 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
- 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
- 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
- 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
- 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
- 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
- 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
- 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
- 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
- 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
- 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
- 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
- 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
- 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
- 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
- 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
- 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
- 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
- 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
- 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
- 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
- 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
- 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
- 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
- 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
- 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
- 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
- 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
- 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
- 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
- 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
- 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
- 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
- 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
- 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
- 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
- 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
- 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
- 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
- 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
- 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
- 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
- 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
- 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
- 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
- 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
- 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
- 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
- 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
- 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
- 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
- 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
- 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
- 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
- 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
- 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
- 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
- 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
- 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
- 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
- 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
- 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
- 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
- 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
- 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
- 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
- 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
- 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
- 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
- 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
- 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
- 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
- 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
- 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
- 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
- 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
- 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
- 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
- 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
- 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
- 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
- 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
- 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
- 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
- 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
- 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
- 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
- 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
- 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
- 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
- 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
- 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
- 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
- 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
- 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
- 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
- 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
- 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
- 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
- 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
- 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
- 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
- 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
- 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
- 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
- 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
- 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
- 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
- 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
- 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
- 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
- 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
- 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
- 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
- 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
- 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
- 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
- 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
- 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
- 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
- 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
- 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
- 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
- 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
- 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
- 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
- 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
- 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
- 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
- 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
- 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
- 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
- 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
- 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
- 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
- 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
- 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
- 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
- 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
- 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
- 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
- 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
- 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
- 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
- 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
- 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
- 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
- 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
- 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
- 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
- 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
- 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
- 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
- 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
- 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
- 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
- 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
- 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
- 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
- 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
- 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
- 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
- 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
- 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
- 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
- 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
- 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
- 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
- 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
- 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
- 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
- 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
- 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
- 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
- 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
- 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
- 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
- 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
- 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
- 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
- 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
- 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
- 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
- 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
- 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
- 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
- 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
- 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
- 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
- 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
- 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
- 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
- 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
- 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
- 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
- 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
- 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
- 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
- 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
- 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
- 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
- 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
- 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
- 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
- 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
- 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
- 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
- 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
- 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
- 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
- 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
- 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
- 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
- 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
- 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
- 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
- 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
- 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
- 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
- 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
- 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
- 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
- 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
- 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
- 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
- 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
- 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
- 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
- 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
- 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
- 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
- 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
- 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
- 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
- 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
- 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
- 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
- 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
- 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
- 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
- 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
- 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
- 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
- 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
- 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
- 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
- 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
- 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
- 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
- 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
- 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
- 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
- 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
- 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
- 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
- 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
- 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
- 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
- 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
- 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
- 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
- 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
- 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
- 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
- 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
- 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
- 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
- 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
- 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
- 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
- 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
- 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
- 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
- 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
- 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
- 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
- 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
- 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
- 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
- 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
- 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
- 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
- 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
- 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
- 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
- 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
- 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
- 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
- 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
- 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
- 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
- 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
- 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
- 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
- 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
- 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
- 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
- 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
- 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
- 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
- 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
- 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
- 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
- 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
- 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
- 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
- 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
- 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
- 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
- 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
- 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
- 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
- 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
- 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
- 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
- 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
- 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
- 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
- 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
- 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
- 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
- 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
- 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
- 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
- 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
- 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
- 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
- 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
- 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
- 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
- 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
- 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
- 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
- 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
- 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
- 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
- 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
- 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
- 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
- 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
- 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
- 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
- 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
- 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
- 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
- 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
- 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
- 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
- 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
- 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
- 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
- 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
- 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
- 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
- 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
- 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
- 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
- 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
- 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
- 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
- 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
- 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
- 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
- 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
- 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
- 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
- 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
- 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
- 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
- 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
- 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
- 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
- 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
- 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
- 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
- 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
- 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
- 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
- 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
- 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
- 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
- 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
- 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
- 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
- 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
- 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
- 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
- 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
- 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
- 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
- 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
- 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
- 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
- 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
- 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
- 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
- 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
- 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
- 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
- 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
- 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
- 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
- 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
- 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
- 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
- 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
- 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
- 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
- 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
- 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
- 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
- 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
- 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
- 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
- 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
- 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
- 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
- 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
- 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
- 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
- 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
- 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
- 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
- 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
- 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
- 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
- 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
- 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
- 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
- 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
- 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
- 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
- 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
- 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
- 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
- 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
- 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
- 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
- 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
- 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
- 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
- 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
- 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
- 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
- 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
- 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
- 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
- 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
- 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
- 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
- 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
- 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
- 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
- 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
- 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
- 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
- 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
- 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
- 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
- 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
- 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
- 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
- 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
- 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
- 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
- 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
- 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
- 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
- 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
- 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
- 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
- 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
- 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
- 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
- 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
- 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
- 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
- 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
- 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
- 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
- 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
- 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
- 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
- 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
- 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
- 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
- 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
- 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
- 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
- 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
- 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
- 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
- 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
- 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
- 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
- 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
- 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
- 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
- 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
- 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
- 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
- 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
- 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
- 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
- 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
- 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
- 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
- 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
- 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
- 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
- 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
- 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
- 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
- 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
- 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
- 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
- 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
- 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
- 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
- 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
- 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
- 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
- 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
- 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
- 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
- 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
- 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
- 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
- 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
- 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
- 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
- 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
- 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
- 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
- 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
- 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
- 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
- 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
- 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
- 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
- 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
- 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
- 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
- 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
- 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
- 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
- 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
- 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
- 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
- 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
- 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
- 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
- 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
- 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
- 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
- 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
- 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
- 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
- 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
- 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
- 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
- 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
- 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
- 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
- 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
- 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
- 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
- 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
- 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
- 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
- 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
- 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
- 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
- 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
- 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
- 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
- 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
- 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
- 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
- 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
- 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
- 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
- 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
- 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
- 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
- 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
- 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
- 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
- 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
- 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
- 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
- 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
- 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
- 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
- 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
- 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
- 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
- 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
- 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
- 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
- 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
- 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
- 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
- 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
- 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
- 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
- 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
- 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
- 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
- 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
- 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
- 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
- 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
- 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
- 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
- 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
- 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
- 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
- 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
- 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
- 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
- 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
- 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
- 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
- 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
- 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
- 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
- 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
- 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
- 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
- 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
- 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
- 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
- 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
- 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
- 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
- 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
- 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
- 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
- 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
- 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
- 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
- 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
- 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
- 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
- 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
- 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
- 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
- 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
- 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
- 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
- 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
- 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
- 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
- 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
- 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
- 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
- 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
- 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
- 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
- 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
- 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
- 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
- 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
- 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
- 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
- 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
- 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
- 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
- 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
- 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
- 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
- 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
- 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
- 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
- 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
- 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
- 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
- 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
- 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
- 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
- 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
- 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
- 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
- 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
- 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
- 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
- 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
- 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
- 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
- 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
- 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
- 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
- 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
- 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
- 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
- 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
- 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
- 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
- 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
- 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
- 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
- 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
- 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
- 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
- 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
- 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
- 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
- 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
- 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
- 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
- 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
- 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
- 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
- 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
- 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
- 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
- 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
- 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
- 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
- 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
- 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
- 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
- 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
- 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
- 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
- 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
- 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
- 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
- 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
- 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
- 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
- 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
- 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
- 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
- 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
- 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
- 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
- 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
- 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
- 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
- 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
- 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
- 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
- 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
- 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
- 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
- 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
- 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
- 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
- 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
- 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
- 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
- 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
- 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
- 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
- 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
- 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
- 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
- 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
- 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
- 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
- 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
- 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
- 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
- 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
- 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
- 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
- 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
- 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
- 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
- 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
- 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
- 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
- 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
- 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
- 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
- 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
- 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
- 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
- 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
- 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
- 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
- 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
- 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
- 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
- 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
- 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
- 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
- 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
- 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
- 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
- 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
- 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
- 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
- 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
- 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
- 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
- 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
- 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
- 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
- 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
- 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
- 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
- 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
- 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
- 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
- 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
- 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
- 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
- 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
- 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
- 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
- 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
- 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
- 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
- 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
- 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
- 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
- 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
- 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
- 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
- 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
- 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
- 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
- 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
- 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
- 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
- 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
- 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
- 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
- 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
- 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
- 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
- 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
- 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
- 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
- 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
- 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
- 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
- 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
- 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
- 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
- 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
- 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
- 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
- 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
- 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
- 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
- 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
- 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
- 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
- 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
- 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
- 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
- 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
- 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
- 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
- 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
- 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
- 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
- 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
- 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
- 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
- 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
- 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
- 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
- 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
- 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
- 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
- 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
- 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
- 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
- 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
- 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
- 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
- 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
- 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
- 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
- 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
- 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
- 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
- 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
- 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
- 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
- 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
- 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
- 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
- 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
- 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
- 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
- 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
- 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
- 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
- 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
- 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
- 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
- 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
- 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
- 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
- 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
- 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
- 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
- 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
- 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
- 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
- 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
- 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
- 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
- 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
- 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
- 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
- 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
- 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
- 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
- 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
- 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
- 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
- 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
- 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
- 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
- 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
- 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
- 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
- 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
- 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
- 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
- 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
- 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
- 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
- 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
- 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
- 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
- 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
- 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
- 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
- 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
- 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
- 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
- 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
- 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
- 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
- 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
- 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
- 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
- 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
- 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
- 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
- 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
- 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
- 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
- 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
- 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
- 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
- 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
- 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
- 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
- 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
- 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
- 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
- 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
- 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
- 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
- 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
- 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
- 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
- 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
- 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
- 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
- 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
- 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
- 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
- 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
- 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
- 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
- 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
- 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
- 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
- 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
- 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
- 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
- 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
- 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
- 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
- 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
- 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
- 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
- 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
- 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
- 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
- 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
- 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
- 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
- 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
- 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
- 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
- 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
- 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
- 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
- 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
- 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
- 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
- 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
- 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
- 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
- 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
- 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
- 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
- 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
- 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
- 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
- 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
- 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
- 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
- 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
- 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
- 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
- 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
- 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
- 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
- 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
- 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
- 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
- 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
- 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
- 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
- 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
- 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
- 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
- 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
- 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
- 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
- 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
- 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
- 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
- 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
- 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
- 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
- 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
- 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
- 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
- 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
- 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
- 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
- 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
- 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
- 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
- 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
- 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
- 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
- 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
- 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
- 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
- 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
- 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
- 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
- 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
- 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
- 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
- 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
- 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
- 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
- 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
- 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
- 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
- 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
- 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
- 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
- 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
- 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
- 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
- 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
- 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
- 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
- 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
- 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
- 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
- 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
- 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
- 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
- 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
- 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
- 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
- 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
- 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
- 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
- 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
- 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
- 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
- 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
- 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
- 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
- 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
- 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
- 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
- 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
- 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
- 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
- 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
- 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
- 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
- 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
- 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
- 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
- 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
- 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
- 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
- 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
- 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
- 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
- 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
- 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
- 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
- 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
- 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
- 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
- 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
- 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
- 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
- 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
- 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
- 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
- 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
- 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
- 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
- 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
- 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
- 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
- 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
- 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
- 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
- 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
- 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
- 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
- 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
- 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
- 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
- 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
- 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
- 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
- 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
- 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
- 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
- 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
- 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
- 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
- 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
- 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
- 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
- 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
- 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
- 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
- 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
- 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
- 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
- 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
- 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
- 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
- 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
- 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
- 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
- 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
- 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
- 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
- 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
- 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
- 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
- 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
- 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
- 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
- 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
- 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
- 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
- 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
- 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
- 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
- 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
- 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
- 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
- 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
- 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
- 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
- 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
- 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
- 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
- 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
- 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
- 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
- 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
- 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
- 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
- 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
- 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
- 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
- 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
- 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
- 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
- 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
- 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
- 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
- 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
- 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
- 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
- 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
- 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
- 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
- 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
- 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
- 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
- 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
- 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
- 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
- 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
- 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
- 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
- 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
- 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
- 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
- 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
- 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
- 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
- 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
- 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
- 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
- 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
- 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
- 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
- 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
- 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
- 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
- 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
- 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
- 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
- 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
- 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
- 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
- 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
- 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
- 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
- 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
- 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
- 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
- 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
- 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
- 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
- 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
- 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
- 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
- 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
- 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
- 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
- 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
- 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
- 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
- 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
- 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
- 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
- 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
- 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
- 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
- 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
- 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
- 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
- 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
- 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
- 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
- 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
- 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
- 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
- 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
- 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
- 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
- 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
- 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
- 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
- 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
- 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
- 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
- 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
- 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
- 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
- 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
- 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
- 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
- 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
- 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
- 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
- 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
- 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
- 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
- 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
- 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
- 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
- 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
- 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
- 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
- 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
- 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
- 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
- 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
- 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
- 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
- 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
- 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
- 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
- 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
- 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
- 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
- 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
- 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
- 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
- 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
- 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
- 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
- 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
- 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
- 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
- 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
- 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
- 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
- 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
- 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
- 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
- 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
- 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
- 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
- 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
- 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
- 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
- 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
- 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
- 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
- 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
- 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
- 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
- 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
- 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
- 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
- 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
- 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
- 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
- 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
- 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
- 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
- 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
- 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
- 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
- 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
- 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
- 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
- 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
- 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
- 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
- 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
- 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
- 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
- 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
- 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
- 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
- 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
- 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
- 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
- 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
- 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
- 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
- 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
- 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
- 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
- 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
- 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
- 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
- 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
- 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
- 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
- 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
- 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
- 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
- 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
- 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
- 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
- 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
- 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
- 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
- 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
- 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
- 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
- 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
- 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
- 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
- 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
- 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
- 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
- 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
- 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
- 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
- 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
- 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
- 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
- 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
- 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
- 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
- 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
- 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
- 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
- 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
- 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
- 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
- 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
- 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
- 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
- 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
- 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
- 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
- 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
- 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
- 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
- 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
- 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
- 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
- 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
- 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
- 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
- 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
- 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
- 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
- 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
- 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
- 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
- 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
- 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
- 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
- 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
- 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
- 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
- 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
- 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
- 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
- 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
- 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
- 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
- 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
- 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
- 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
- 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
- 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
- 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
- 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
- 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
- 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
- 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
- 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
- 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
- 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
- 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
- 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
- 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
- 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
- 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
- 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
- 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
- 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
- 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
- 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
- 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
- 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
- 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
- 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
- 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
- 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
- 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
- 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
- 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
- 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
- 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
- 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
- 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
- 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
- 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
- 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
- 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
- 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
- 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
- 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
- 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
- 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
- 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
- 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
- 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
- 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
- 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
- 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
- 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
- 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
- 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
- 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
- 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
- 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
- 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
- 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
- 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
- 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
- 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
- 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
- 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
- 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
- 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
- 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
- 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
- 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
- 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
- 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
- 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
- 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
- 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
- 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
- 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
- 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
- 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
- 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
- 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
- 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
- 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
- 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
- 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
- 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
- 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
- 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
- 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
- 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
- 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
- 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
- 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
- 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
- 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
- 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
- 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
- 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
- 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
- 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
- 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
- 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
- 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
- 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
- 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
- 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
- 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
- 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
- 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
- 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
- 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
- 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
- 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
- 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
- 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
- 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
- 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
- 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
- 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
- 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
- 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
- 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
- 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
- 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
- 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
- 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
- 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
- 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
- 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
- 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
- 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
- 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
- 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
- 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
- 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
- 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
- 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
- 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
- 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
- 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
- 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
- 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
- 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
- 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
- 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
- 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
- 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
- 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
- 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
- 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
- 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
- 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
- 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
- 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
- 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
- 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
- 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
- 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
- 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
- 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
- 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
- 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
- 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
- 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
- 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
- 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
- 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
- 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
- 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
- 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
- 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
- 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
- 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
- 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
- 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
- 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
- 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
- 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
- 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
- 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
- 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
- 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
- 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
- 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
- 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
- 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
- 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
- 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
- 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
- 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
- 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
- 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
- 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
- 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
- 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
- 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
- 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
- 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
- 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
- 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
- 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
- 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
- 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
- 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
- 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
- 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
- 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
- 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
- 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
- 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
- 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
- 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
- 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
- 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
- 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
- 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
- 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
- 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
- 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
- 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
- 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
- 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
- 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
- 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
- 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
- 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
- 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
- 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
- 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
- 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
- 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
- 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
- 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
- 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
- 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
- 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
- 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
- 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
- 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
- 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
- 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
- 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
- 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
- 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
- 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
- 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
- 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
- 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
- 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
- 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
- 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
- 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
- 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
- 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
- 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
- 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
- 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
- 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
- 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
- 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
- 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
- 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
- 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
- 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
- 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
- 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
- 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
- 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
- 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
- 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
- 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
- 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
- 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
- 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
- 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
- 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
- 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
- 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
- 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
- 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
- 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
- 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
- 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
- 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
- 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
- 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
- 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
- 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
- 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
- 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
- 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
- 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
- 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
- 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
- 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
- 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
- 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
- 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
- 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
- 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
- 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
- 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
- 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
- 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
- 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
- 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
- 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
- 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
- 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
- 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
- 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
- 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
- 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
- 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
- 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
- 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
- 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
- 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
- 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
- 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
- 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
- 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
- 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
- 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
- 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
- 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
- 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
- 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
- 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
- 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
- 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
- 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
- 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
- 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
- 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
- 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
- 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
- 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
- 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
- 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
- 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
- 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
- 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
- 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
- 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
- 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
- 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
- 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
- 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
- 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
- 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
- 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
- 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
- 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
- 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
- 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
- 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
- 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
- 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
- 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
- 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
- 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
- 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
- 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
- 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
- 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
- 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
- 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
- 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
- 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
- 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
- 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
- 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
- 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
- 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
- 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
- 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
- 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
- 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
- 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
- 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
- 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
- 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
- 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
- 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
- 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
- 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
- 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
- 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
- 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
- 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
- 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
- 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
- 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
- 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
- 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
- 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
- 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
- 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
- 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
- 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
- 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
- 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
- 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
- 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
- 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
- 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
- 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
- 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
- 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
- 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
- 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
- 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
- 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
- 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
- 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
- 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
- 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
- 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
- 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
- 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
- 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
- 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
- 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
- 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
- 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
- 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
- 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
- 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
- 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
- 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
- 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
- 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
- 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
- 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
- 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
- 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
- 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
- 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
- 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
- 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
- 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
- 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
- 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
- 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
- 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
- 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
- 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
- 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
- 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
- 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
- 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
- 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
- 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
- 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
- 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
- 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
- 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
- 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
- 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
- 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
- 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
- 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
- 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
- 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
- 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
- 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
- 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
- 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
- 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
- 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
- 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
- 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
- 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
- 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
- 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
- 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
- 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
- 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
- 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
- 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
- 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
- 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
- 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
- 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
- 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
- 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
- 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
- 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
- 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
- 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
- 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
- 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
- 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
- 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
- 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
- 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
- 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
- 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
- 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
- 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
- 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
- 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
- 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
- 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
- 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
- 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
- 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
- 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
- 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
- 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
- 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
- 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
- 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
- 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
- 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
- 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
- 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
- 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
- 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
- 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
- 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
- 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
- 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
- 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
- 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
- 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
- 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
- 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
- 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
- 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
- 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
- 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
- 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
- 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
- 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
- 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
- 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
- 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
- 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
- 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
- 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
- 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
- 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
- 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
- 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
- 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
- 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
- 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
- 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
- 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
- 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
- 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
- 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
- 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
- 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
- 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
- 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
- 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
- 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
- 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
- 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
- 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
- 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
- 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
- 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
- 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
- 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
- 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
- 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
- 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
- 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
- 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
- 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
- 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
- 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
- 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
- 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
- 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
- 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
- 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
- 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
- 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
- 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
- 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
- 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
- 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
- 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
- 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
- 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
- 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
- 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
- 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
- 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
- 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
- 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
- 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
- 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
- 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
- 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
- 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
- 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
- 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
- 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
- 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
- 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
- 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
- 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
- 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
- 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
- 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
- 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
- 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
- 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
- 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
- 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
- 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
- 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
- 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
- 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
- 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
- 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
- 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
- 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
- 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
- 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
- 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
- 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
- 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
- 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
- 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
- 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
- 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
- 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
- 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
- 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
- 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
- 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
- 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
- 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
- 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
- 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
- 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
- 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
- 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
- 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
- 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
- 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
- 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
- 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
- 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
- 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
- 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
- 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
- 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
- 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
- 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
- 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
- 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
- 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
- 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
- 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
- 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
- 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
- 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
- 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
- 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
- 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
- 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
- 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
- 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
- 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
- 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
- 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
- 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
- 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
- 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
- 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
- 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
- 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
- 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
- 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
- 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
- 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
- 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
- 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
- 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
- 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
- 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
- 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
- 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
- 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
- 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
- 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
- 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
- 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
- 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
- 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
- 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
- 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
- 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
- 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
- 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
- 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
- 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
- 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
- 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
- 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
- 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
- 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
- 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
- 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
- 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
- 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
- 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
- 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
- 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
- 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
- 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
- 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
- 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
- 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
- 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
- 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
- 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
- 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
- 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
- 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
- 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
- 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
- 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
- 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
- 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
- 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
- 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
- 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
- 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
- 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
- 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
- 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
- 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
- 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
- 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
- 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
- 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
- 835584U, // <u,1,2,3>: Cost 0 copy LHS
- 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
- 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
- 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
- 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
- 835584U, // <u,1,2,u>: Cost 0 copy LHS
- 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
- 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
- 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
- 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
- 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
- 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
- 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
- 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
- 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
- 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
- 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
- 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
- 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
- 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
- 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
- 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
- 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
- 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
- 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
- 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
- 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
- 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
- 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
- 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
- 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
- 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
- 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
- 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
- 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
- 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
- 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
- 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
- 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
- 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
- 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
- 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
- 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
- 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
- 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
- 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
- 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
- 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
- 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
- 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
- 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
- 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
- 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
- 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
- 835584U, // <u,1,u,3>: Cost 0 copy LHS
- 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
- 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
- 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
- 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
- 835584U, // <u,1,u,u>: Cost 0 copy LHS
- 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
- 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
- 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
- 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
- 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
- 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
- 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
- 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
- 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
- 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
- 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
- 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
- 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
- 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
- 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
- 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
- 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
- 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
- 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
- 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
- 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
- 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
- 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
- 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
- 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
- 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
- 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
- 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
- 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
- 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
- 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
- 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
- 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
- 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
- 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
- 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
- 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
- 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
- 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
- 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
- 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
- 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
- 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
- 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
- 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
- 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
- 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
- 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
- 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
- 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
- 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
- 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
- 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
- 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
- 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
- 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
- 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
- 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
- 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
- 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
- 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
- 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
- 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
- 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
- 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
- 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
- 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
- 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
- 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
- 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
- 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
- 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
- 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
- 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
- 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
- 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
- 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
- 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
- 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
- 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
- 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
- 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
- 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
- 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
- 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
- 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
- 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
- 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
- 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
- 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
- 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
- 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
- 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
- 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
- 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
- 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
- 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
- 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
- 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
- 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
- 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
- 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
- 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
- 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
- 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
- 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
- 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
- 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
- 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
- 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
- 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
- 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
- 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
- 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
- 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
- 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
- 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
- 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
- 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
- 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
- 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
- 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
- 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
- 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
- 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
- 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
- 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
- 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
- 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
- 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
- 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
- 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
- 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
- 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
- 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
- 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
- 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
- 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
- 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
- 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
- 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
- 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
- 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
- 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
- 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
- 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
- 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
- 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
- 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
- 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
- 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
- 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
- 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
- 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
- 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
- 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
- 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
- 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
- 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
- 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
- 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
- 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
- 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
- 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
- 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
- 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
- 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
- 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
- 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
- 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
- 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
- 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
- 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
- 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
- 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
- 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
- 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
- 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
- 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
- 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
- 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
- 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
- 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
- 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
- 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
- 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
- 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
- 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
- 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
- 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
- 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
- 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
- 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
- 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
- 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
- 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
- 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
- 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
- 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
- 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
- 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
- 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
- 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
- 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
- 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
- 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
- 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
- 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
- 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
- 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
- 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
- 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
- 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
- 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
- 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
- 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
- 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
- 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
- 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
- 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
- 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
- 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
- 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
- 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
- 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
- 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
- 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
- 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
- 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
- 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
- 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
- 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
- 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
- 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
- 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
- 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
- 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
- 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
- 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
- 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
- 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
- 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
- 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
- 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
- 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
- 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
- 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
- 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
- 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
- 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
- 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
- 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
- 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
- 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
- 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
- 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
- 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
- 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
- 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
- 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
- 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
- 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
- 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
- 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
- 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
- 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
- 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
- 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
- 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
- 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
- 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
- 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
- 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
- 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
- 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
- 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
- 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
- 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
- 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
- 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
- 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
- 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
- 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
- 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
- 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
- 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
- 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
- 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
- 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
- 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
- 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
- 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
- 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
- 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
- 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
- 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
- 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
- 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
- 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
- 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
- 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
- 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
- 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
- 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
- 27705344U, // <u,5,6,7>: Cost 0 copy RHS
- 27705344U, // <u,5,6,u>: Cost 0 copy RHS
- 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
- 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
- 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
- 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
- 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
- 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
- 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
- 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
- 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
- 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
- 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
- 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
- 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
- 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
- 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
- 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
- 27705344U, // <u,5,u,7>: Cost 0 copy RHS
- 27705344U, // <u,5,u,u>: Cost 0 copy RHS
- 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
- 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
- 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
- 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
- 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
- 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
- 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
- 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
- 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
- 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
- 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
- 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
- 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
- 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
- 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
- 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
- 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
- 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
- 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
- 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
- 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
- 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
- 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
- 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
- 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
- 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
- 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
- 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
- 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
- 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
- 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
- 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
- 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
- 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
- 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
- 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
- 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
- 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
- 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
- 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
- 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
- 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
- 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
- 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
- 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
- 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
- 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
- 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
- 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
- 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
- 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
- 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
- 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
- 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
- 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
- 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
- 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
- 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
- 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
- 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
- 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
- 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
- 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
- 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
- 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
- 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
- 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
- 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
- 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
- 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
- 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
- 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
- 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
- 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
- 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
- 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
- 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
- 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
- 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
- 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
- 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
- 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
- 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
- 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
- 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
- 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
- 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
- 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
- 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
- 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
- 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
- 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
- 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
- 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
- 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
- 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
- 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
- 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
- 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
- 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
- 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
- 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
- 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
- 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
- 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
- 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
- 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
- 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
- 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
- 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
- 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
- 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
- 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
- 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
- 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
- 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
- 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
- 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
- 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
- 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
- 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
- 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
- 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
- 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
- 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
- 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
- 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
- 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
- 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
- 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
- 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
- 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
- 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
- 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
- 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
- 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
- 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
- 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
- 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
- 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
- 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
- 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
- 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
- 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
- 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
- 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
- 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
- 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
- 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
- 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
- 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
- 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
- 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
- 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
- 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
- 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
- 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
- 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
- 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
- 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
- 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
- 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
- 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
- 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
- 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
- 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
- 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
- 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
- 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
- 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
- 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
- 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
- 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
- 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
- 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
- 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
- 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
- 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
- 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
- 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
- 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
- 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
- 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
- 835584U, // <u,u,2,3>: Cost 0 copy LHS
- 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
- 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
- 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
- 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
- 835584U, // <u,u,2,u>: Cost 0 copy LHS
- 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
- 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
- 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
- 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
- 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
- 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
- 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
- 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
- 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
- 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
- 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
- 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
- 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
- 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
- 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
- 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
- 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
- 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
- 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
- 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
- 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
- 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
- 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
- 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
- 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
- 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
- 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
- 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
- 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
- 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
- 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
- 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
- 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
- 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
- 27705344U, // <u,u,6,7>: Cost 0 copy RHS
- 27705344U, // <u,u,6,u>: Cost 0 copy RHS
- 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
- 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
- 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
- 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
- 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
- 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
- 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
- 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
- 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
- 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
- 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
- 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
- 835584U, // <u,u,u,3>: Cost 0 copy LHS
- 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
- 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
- 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
- 27705344U, // <u,u,u,7>: Cost 0 copy RHS
- 835584U, // <u,u,u,u>: Cost 0 copy LHS
- 0
-};
+static const unsigned PerfectShuffleTable[6561 + 1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 2080972802U, // <0,0,0,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 1679065190U, // <0,0,0,2>: Cost 2 vuzpl <0,2,0,2>, LHS
+ 2085707777U, // <0,0,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2080440323U, // <0,0,0,5>: Cost 2 ins <0,0,0,u>, lane 3
+ 2080440323U, // <0,0,0,6>: Cost 2 ins <0,0,0,u>, lane 3
+ 2080440323U, // <0,0,0,7>: Cost 2 ins <0,0,0,u>, lane 3
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 1812774912U, // <0,0,1,0>: Cost 2 vzipl LHS, <0,0,0,0>
+ 739033190U, // <0,0,1,1>: Cost 1 vzipl LHS, LHS
+ 1812775076U, // <0,0,1,2>: Cost 2 vzipl LHS, <0,2,0,2>
+ 2080514051U, // <0,0,1,3>: Cost 2 ins <0,0,1,u>, lane 3
+ 1812816210U, // <0,0,1,4>: Cost 2 vzipl LHS, <0,4,1,5>
+ 2085797889U, // <0,0,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 2080514051U, // <0,0,1,6>: Cost 2 ins <0,0,1,u>, lane 3
+ 2080514051U, // <0,0,1,7>: Cost 2 ins <0,0,1,u>, lane 3
+ 739033757U, // <0,0,1,u>: Cost 1 vzipl LHS, LHS
+ 1946992640U, // <0,0,2,0>: Cost 2 vtrnl LHS, <0,0,0,0>
+ 1946992650U, // <0,0,2,1>: Cost 2 vtrnl LHS, <0,0,1,1>
+ 873250918U, // <0,0,2,2>: Cost 1 vtrnl LHS, LHS
+ 1012113409U, // <0,0,2,3>: Cost 1 ins LHS, lane 1
+ 1946992844U, // <0,0,2,4>: Cost 2 vtrnl LHS, <0,2,4,6>
+ 2080587779U, // <0,0,2,5>: Cost 2 ins <0,0,2,u>, lane 3
+ 2085879809U, // <0,0,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2080587779U, // <0,0,2,7>: Cost 2 ins <0,0,2,u>, lane 3
+ 873250972U, // <0,0,2,u>: Cost 1 vtrnl LHS, LHS
+ 2080964610U, // <0,0,3,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 2080972802U, // <0,0,3,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 2128388096U, // <0,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 2013437973U, // <0,0,3,3>: Cost 2 vtrnr <0,0,2,3>, <0,0,2,3>
+ 3154739202U, // <0,0,3,4>: Cost 3 ins <0,0,u,4>, lane 2
+ 2752809474U, // <0,0,3,5>: Cost 3 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3154755586U, // <0,0,3,6>: Cost 3 ins <0,0,u,6>, lane 2
+ 2818573312U, // <0,0,3,7>: Cost 3 vuzpr <0,0,0,0>, <1,3,5,7>
+ 2080972802U, // <0,0,3,u>: Cost 2 ins <0,0,u,1>, lane 2
+ 2080964610U, // <0,0,4,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 1814708326U, // <0,0,4,1>: Cost 2 vzipl <0,4,1,5>, LHS
+ 1947828326U, // <0,0,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS
+ 2086002689U, // <0,0,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 1947828428U, // <0,0,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6>
+ 2081030149U, // <0,0,4,5>: Cost 2 ins <0,0,u,u>, lane 5
+ 1679068470U, // <0,0,4,6>: Cost 2 vuzpl <0,2,0,2>, RHS
+ 3154477059U, // <0,0,4,7>: Cost 3 ins <0,0,4,u>, lane 3
+ 1679068488U, // <0,0,4,u>: Cost 2 vuzpl <0,2,0,2>, RHS
+ 2080964610U, // <0,0,5,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 2128527360U, // <0,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 2080980994U, // <0,0,5,2>: Cost 2 ins <0,0,u,2>, lane 2
+ 2086076417U, // <0,0,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 3202293760U, // <0,0,5,4>: Cost 3 ins <u,0,5,4>, lane 0
+ 1947213953U, // <0,0,5,5>: Cost 2 vtrnl <0,1,5,3>, <0,1,5,3>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 1744833846U, // <0,0,5,7>: Cost 2 vuzpr <0,0,0,0>, RHS
+ 2128527360U, // <0,0,5,u>: Cost 2 ins <u,0,5,1>, lane 0
+ 2080964610U, // <0,0,6,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 2080972802U, // <0,0,6,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 2128609280U, // <0,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 2086150145U, // <0,0,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 3202367488U, // <0,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0
+ 2617250536U, // <0,0,6,5>: Cost 3 vext2 <0,0,0,0>, <6,5,6,7>
+ 1947287690U, // <0,0,6,6>: Cost 2 vtrnl <0,1,6,3>, <0,1,6,3>
+ 2081030149U, // <0,0,6,7>: Cost 2 ins <0,0,u,u>, lane 5
+ 2080972802U, // <0,0,6,u>: Cost 2 ins <0,0,u,1>, lane 2
+ 2080964610U, // <0,0,7,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 2080972802U, // <0,0,7,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 2080980994U, // <0,0,7,2>: Cost 2 ins <0,0,u,2>, lane 2
+ 2086223873U, // <0,0,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 3154739202U, // <0,0,7,4>: Cost 3 ins <0,0,u,4>, lane 2
+ 2617251265U, // <0,0,7,5>: Cost 3 vext2 <0,0,0,0>, <7,5,6,7>
+ 3154755586U, // <0,0,7,6>: Cost 3 ins <0,0,u,6>, lane 2
+ 1947361427U, // <0,0,7,7>: Cost 2 vtrnl <0,1,7,3>, <0,1,7,3>
+ 2080972802U, // <0,0,7,u>: Cost 2 ins <0,0,u,1>, lane 2
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 743678054U, // <0,0,u,1>: Cost 1 vzipl LHS, LHS
+ 873693286U, // <0,0,u,2>: Cost 1 vtrnl LHS, LHS
+ 1012113409U, // <0,0,u,3>: Cost 1 ins LHS, lane 1
+ 1947435212U, // <0,0,u,4>: Cost 2 vtrnl LHS, <0,2,4,6>
+ 2085797889U, // <0,0,u,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 1679071386U, // <0,0,u,6>: Cost 2 vuzpl <0,2,0,2>, RHS
+ 2080514051U, // <0,0,u,7>: Cost 2 ins <0,0,1,u>, lane 3
+ 873693340U, // <0,0,u,u>: Cost 1 vtrnl LHS, LHS
+ 2085683201U, // <0,1,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 1007951877U, // <0,1,0,1>: Cost 1 ins LHS, lane 5
+ 1680490598U, // <0,1,0,2>: Cost 2 vuzpl <0,4,1,5>, LHS
+ 1007910914U, // <0,1,0,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <0,1,0,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 2081669122U, // <0,1,0,5>: Cost 2 ins <0,1,u,5>, lane 2
+ 2081677314U, // <0,1,0,6>: Cost 2 ins <0,1,u,6>, lane 2
+ 2081685506U, // <0,1,0,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007951877U, // <0,1,0,u>: Cost 1 ins LHS, lane 5
+ 1812775670U, // <0,1,1,0>: Cost 2 vzipl LHS, <1,0,3,2>
+ 1812775732U, // <0,1,1,1>: Cost 2 vzipl LHS, <1,1,1,1>
+ 1812775830U, // <0,1,1,2>: Cost 2 vzipl LHS, <1,2,3,0>
+ 1007910914U, // <0,1,1,3>: Cost 1 ins LHS, lane 2
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 1812817040U, // <0,1,1,5>: Cost 2 vzipl LHS, <1,5,3,7>
+ 2081677314U, // <0,1,1,6>: Cost 2 ins <0,1,u,6>, lane 2
+ 2081685506U, // <0,1,1,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007910914U, // <0,1,1,u>: Cost 1 ins LHS, lane 2
+ 1007509507U, // <0,1,2,0>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,2,1>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,2,2>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1007509507U, // <0,1,2,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,2,5>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,2,6>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,2,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2133680132U, // <0,1,3,0>: Cost 2 ins <u,u,3,0>, lane 4
+ 2081636354U, // <0,1,3,1>: Cost 2 ins <0,1,u,1>, lane 2
+ 2133696516U, // <0,1,3,2>: Cost 2 ins <u,u,3,2>, lane 4
+ 1007910914U, // <0,1,3,3>: Cost 1 ins LHS, lane 2
+ 2133712900U, // <0,1,3,4>: Cost 2 ins <u,u,3,4>, lane 4
+ 2081669122U, // <0,1,3,5>: Cost 2 ins <0,1,u,5>, lane 2
+ 2081677314U, // <0,1,3,6>: Cost 2 ins <0,1,u,6>, lane 2
+ 2133737476U, // <0,1,3,7>: Cost 2 ins <u,u,3,7>, lane 4
+ 1007910914U, // <0,1,3,u>: Cost 1 ins LHS, lane 2
+ 2081628162U, // <0,1,4,0>: Cost 2 ins <0,1,u,0>, lane 2
+ 2081636354U, // <0,1,4,1>: Cost 2 ins <0,1,u,1>, lane 2
+ 2081644546U, // <0,1,4,2>: Cost 2 ins <0,1,u,2>, lane 2
+ 1007910914U, // <0,1,4,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <0,1,4,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 1007951877U, // <0,1,4,5>: Cost 1 ins LHS, lane 5
+ 1680493878U, // <0,1,4,6>: Cost 2 vuzpl <0,4,1,5>, RHS
+ 2081685506U, // <0,1,4,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007910914U, // <0,1,4,u>: Cost 1 ins LHS, lane 2
+ 2081628162U, // <0,1,5,0>: Cost 2 ins <0,1,u,0>, lane 2
+ 2133835780U, // <0,1,5,1>: Cost 2 ins <u,u,5,1>, lane 4
+ 2081644546U, // <0,1,5,2>: Cost 2 ins <0,1,u,2>, lane 2
+ 1007910914U, // <0,1,5,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <0,1,5,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 2133868548U, // <0,1,5,5>: Cost 2 ins <u,u,5,5>, lane 4
+ 2133876740U, // <0,1,5,6>: Cost 2 ins <u,u,5,6>, lane 4
+ 2133884932U, // <0,1,5,7>: Cost 2 ins <u,u,5,7>, lane 4
+ 1007910914U, // <0,1,5,u>: Cost 1 ins LHS, lane 2
+ 2081628162U, // <0,1,6,0>: Cost 2 ins <0,1,u,0>, lane 2
+ 2081636354U, // <0,1,6,1>: Cost 2 ins <0,1,u,1>, lane 2
+ 2133917700U, // <0,1,6,2>: Cost 2 ins <u,u,6,2>, lane 4
+ 1007910914U, // <0,1,6,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <0,1,6,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 2081669122U, // <0,1,6,5>: Cost 2 ins <0,1,u,5>, lane 2
+ 2133950468U, // <0,1,6,6>: Cost 2 ins <u,u,6,6>, lane 4
+ 1060216836U, // <0,1,6,7>: Cost 1 ins RHS, lane 4
+ 1007910914U, // <0,1,6,u>: Cost 1 ins LHS, lane 2
+ 2133975044U, // <0,1,7,0>: Cost 2 ins <u,u,7,0>, lane 4
+ 2081636354U, // <0,1,7,1>: Cost 2 ins <0,1,u,1>, lane 2
+ 2081644546U, // <0,1,7,2>: Cost 2 ins <0,1,u,2>, lane 2
+ 1007910914U, // <0,1,7,3>: Cost 1 ins LHS, lane 2
+ 2134007812U, // <0,1,7,4>: Cost 2 ins <u,u,7,4>, lane 4
+ 2081669122U, // <0,1,7,5>: Cost 2 ins <0,1,u,5>, lane 2
+ 2134024196U, // <0,1,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 2134032388U, // <0,1,7,7>: Cost 2 ins <u,u,7,7>, lane 4
+ 1007910914U, // <0,1,7,u>: Cost 1 ins LHS, lane 2
+ 1007509507U, // <0,1,u,0>: Cost 1 ins LHS, lane 3
+ 1007951877U, // <0,1,u,1>: Cost 1 ins LHS, lane 5
+ 1007509507U, // <0,1,u,2>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1007509507U, // <0,1,u,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,u,5>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,u,6>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,1,u,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 1678557184U, // <0,2,0,0>: Cost 2 vuzpl LHS, <0,0,0,0>
+ 1678598154U, // <0,2,0,1>: Cost 2 vuzpl LHS, <0,0,1,1>
+ 604815462U, // <0,2,0,2>: Cost 1 vuzpl LHS, LHS
+ 2081767427U, // <0,2,0,3>: Cost 2 ins <0,2,0,u>, lane 3
+ 1678598348U, // <0,2,0,4>: Cost 2 vuzpl LHS, <0,2,4,6>
+ 2081767427U, // <0,2,0,5>: Cost 2 ins <0,2,0,u>, lane 3
+ 2082340866U, // <0,2,0,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 2081767427U, // <0,2,0,7>: Cost 2 ins <0,2,0,u>, lane 3
+ 604815516U, // <0,2,0,u>: Cost 1 vuzpl LHS, LHS
+ 2752340940U, // <0,2,1,0>: Cost 3 vuzpl LHS, <1,3,0,0>
+ 1678558004U, // <0,2,1,1>: Cost 2 vuzpl LHS, <1,1,1,1>
+ 1812776552U, // <0,2,1,2>: Cost 2 vzipl LHS, <2,2,2,2>
+ 1678557942U, // <0,2,1,3>: Cost 2 vuzpl LHS, <1,0,3,2>
+ 2752340982U, // <0,2,1,4>: Cost 3 vuzpl LHS, <1,3,4,6>
+ 1678599168U, // <0,2,1,5>: Cost 2 vuzpl LHS, <1,3,5,7>
+ 1812817850U, // <0,2,1,6>: Cost 2 vzipl LHS, <2,6,3,7>
+ 2860466282U, // <0,2,1,7>: Cost 3 vuzpr <7,0,1,2>, <0,1,2,7>
+ 1678598947U, // <0,2,1,u>: Cost 2 vuzpl LHS, <1,0,u,2>
+ 1678558886U, // <0,2,2,0>: Cost 2 vuzpl LHS, <2,3,0,1>
+ 2085838849U, // <0,2,2,1>: Cost 2 ins <0,u,2,1>, lane 1
+ 1678558824U, // <0,2,2,2>: Cost 2 vuzpl LHS, <2,2,2,2>
+ 1012113409U, // <0,2,2,3>: Cost 1 ins LHS, lane 1
+ 1678558926U, // <0,2,2,4>: Cost 2 vuzpl LHS, <2,3,4,5>
+ 2085871617U, // <0,2,2,5>: Cost 2 ins <0,u,2,5>, lane 1
+ 2085879809U, // <0,2,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2085888001U, // <0,2,2,7>: Cost 2 ins <0,u,2,7>, lane 1
+ 1012113409U, // <0,2,2,u>: Cost 1 ins LHS, lane 1
+ 2129698816U, // <0,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 1678559382U, // <0,2,3,1>: Cost 2 vuzpl LHS, <3,0,1,2>
+ 2082308098U, // <0,2,3,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 1678559644U, // <0,2,3,3>: Cost 2 vuzpl LHS, <3,3,3,3>
+ 2129731584U, // <0,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 1678559746U, // <0,2,3,5>: Cost 2 vuzpl LHS, <3,4,5,6>
+ 2082340866U, // <0,2,3,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 2824782848U, // <0,2,3,7>: Cost 3 vuzpr <1,0,3,2>, <1,3,5,7>
+ 1678559445U, // <0,2,3,u>: Cost 2 vuzpl LHS, <3,0,u,2>
+ 2082062339U, // <0,2,4,0>: Cost 2 ins <0,2,4,u>, lane 3
+ 2082062339U, // <0,2,4,1>: Cost 2 ins <0,2,4,u>, lane 3
+ 2082308098U, // <0,2,4,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 2082062339U, // <0,2,4,3>: Cost 2 ins <0,2,4,u>, lane 3
+ 2082062339U, // <0,2,4,4>: Cost 2 ins <0,2,4,u>, lane 3
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 604818742U, // <0,2,4,6>: Cost 1 vuzpl LHS, RHS
+ 2082062339U, // <0,2,4,7>: Cost 2 ins <0,2,4,u>, lane 3
+ 604818760U, // <0,2,4,u>: Cost 1 vuzpl LHS, RHS
+ 3105260438U, // <0,2,5,0>: Cost 3 vtrnr <3,0,4,5>, <1,2,3,0>
+ 1678561408U, // <0,2,5,1>: Cost 2 vuzpl LHS, <5,7,1,3>
+ 2082308098U, // <0,2,5,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 2086076417U, // <0,2,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 2756947554U, // <0,2,5,4>: Cost 3 vuzpl LHS, <5,0,4,1>
+ 1678561284U, // <0,2,5,5>: Cost 2 vuzpl LHS, <5,5,5,5>
+ 2082340866U, // <0,2,5,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 1751043382U, // <0,2,5,7>: Cost 2 vuzpr <1,0,3,2>, RHS
+ 1751043383U, // <0,2,5,u>: Cost 2 vuzpr <1,0,3,2>, RHS
+ 1678562126U, // <0,2,6,0>: Cost 2 vuzpl LHS, <6,7,0,1>
+ 2756948257U, // <0,2,6,1>: Cost 3 vuzpl LHS, <6,0,1,2>
+ 2082308098U, // <0,2,6,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 2086150145U, // <0,2,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 1678562166U, // <0,2,6,4>: Cost 2 vuzpl LHS, <6,7,4,5>
+ 2756948621U, // <0,2,6,5>: Cost 3 vuzpl LHS, <6,4,5,6>
+ 2082340866U, // <0,2,6,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 2082357253U, // <0,2,6,7>: Cost 2 ins <0,2,u,u>, lane 5
+ 2082308098U, // <0,2,6,u>: Cost 2 ins <0,2,u,2>, lane 2
+ 3099378582U, // <0,2,7,0>: Cost 3 vtrnr <2,0,5,7>, <1,2,3,0>
+ 1678562298U, // <0,2,7,1>: Cost 2 vuzpl LHS, <7,0,1,2>
+ 2082308098U, // <0,2,7,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 2130018304U, // <0,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0
+ 2645136742U, // <0,2,7,4>: Cost 3 vext2 <4,6,0,2>, <7,4,5,6>
+ 1678562662U, // <0,2,7,5>: Cost 2 vuzpl LHS, <7,4,5,6>
+ 2082340866U, // <0,2,7,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 1678562924U, // <0,2,7,7>: Cost 2 vuzpl LHS, <7,7,7,7>
+ 2082308098U, // <0,2,7,u>: Cost 2 ins <0,2,u,2>, lane 2
+ 1947436710U, // <0,2,u,0>: Cost 2 vtrnl LHS, <2,3,0,1>
+ 1678603987U, // <0,2,u,1>: Cost 2 vuzpl LHS, <u,0,1,2>
+ 604821294U, // <0,2,u,2>: Cost 1 vuzpl LHS, LHS
+ 1012113409U, // <0,2,u,3>: Cost 1 ins LHS, lane 1
+ 1947436750U, // <0,2,u,4>: Cost 2 vtrnl LHS, <2,3,4,5>
+ 1678604351U, // <0,2,u,5>: Cost 2 vuzpl LHS, <u,4,5,6>
+ 604821658U, // <0,2,u,6>: Cost 1 vuzpl LHS, RHS
+ 1751043625U, // <0,2,u,7>: Cost 2 vuzpr <1,0,3,2>, RHS
+ 604821348U, // <0,2,u,u>: Cost 1 vuzpl LHS, LHS
+ 2085683201U, // <0,3,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 2130149376U, // <0,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0
+ 2085699585U, // <0,3,0,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 1745002517U, // <0,3,0,3>: Cost 2 vuzpr <0,0,2,3>, <0,0,2,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 3021244930U, // <0,3,0,5>: Cost 3 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3159474177U, // <0,3,0,6>: Cost 3 ins <0,u,0,6>, lane 1
+ 2952791184U, // <0,3,0,7>: Cost 3 vzipr <0,0,0,0>, <1,5,3,7>
+ 2130149376U, // <0,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0
+ 1812777110U, // <0,3,1,0>: Cost 2 vzipl LHS, <3,0,1,2>
+ 2085765121U, // <0,3,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 2886519105U, // <0,3,1,2>: Cost 3 vzipl LHS, <3,2,2,2>
+ 1812777372U, // <0,3,1,3>: Cost 2 vzipl LHS, <3,3,3,3>
+ 1812777474U, // <0,3,1,4>: Cost 2 vzipl LHS, <3,4,5,6>
+ 2085797889U, // <0,3,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 3159547905U, // <0,3,1,6>: Cost 3 ins <0,u,1,6>, lane 1
+ 2966733968U, // <0,3,1,7>: Cost 3 vzipr <2,3,0,1>, <1,5,3,7>
+ 1812777758U, // <0,3,1,u>: Cost 2 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1946994838U, // <0,3,2,1>: Cost 2 vtrnl LHS, <3,0,1,2>
+ 2085847041U, // <0,3,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <0,3,2,3>: Cost 1 ins LHS, lane 1
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 1946995202U, // <0,3,2,5>: Cost 2 vtrnl LHS, <3,4,5,6>
+ 2085879809U, // <0,3,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2085888001U, // <0,3,2,7>: Cost 2 ins <0,u,2,7>, lane 1
+ 1012113409U, // <0,3,2,u>: Cost 1 ins LHS, lane 1
+ 2887747734U, // <0,3,3,0>: Cost 3 vzipl <0,3,1,0>, <3,0,1,2>
+ 2753022102U, // <0,3,3,1>: Cost 3 vuzpl <0,2,3,1>, <3,0,1,2>
+ 2965422838U, // <0,3,3,2>: Cost 3 vzipr <2,1,0,3>, <1,0,3,2>
+ 2130386944U, // <0,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0
+ 2887780866U, // <0,3,3,4>: Cost 3 vzipl <0,3,1,4>, <3,4,5,6>
+ 2753055234U, // <0,3,3,5>: Cost 3 vuzpl <0,2,3,5>, <3,4,5,6>
+ 2752375389U, // <0,3,3,6>: Cost 3 vuzpl <0,1,3,3>, <3,5,6,7>
+ 3204161536U, // <0,3,3,7>: Cost 3 ins <u,3,3,7>, lane 0
+ 2130386944U, // <0,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0
+ 2888452246U, // <0,3,4,0>: Cost 3 vzipl <0,4,1,5>, <3,0,1,2>
+ 3021572246U, // <0,3,4,1>: Cost 3 vtrnl <0,2,4,6>, <3,0,1,2>
+ 3021572257U, // <0,3,4,2>: Cost 3 vtrnl <0,2,4,6>, <3,0,2,4>
+ 2086002689U, // <0,3,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 2888452610U, // <0,3,4,4>: Cost 3 vzipl <0,4,1,5>, <3,4,5,6>
+ 2130477056U, // <0,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0
+ 2086027265U, // <0,3,4,6>: Cost 2 ins <0,u,4,6>, lane 1
+ 2818747621U, // <0,3,4,7>: Cost 3 vuzpr <0,0,2,3>, <4,4,6,7>
+ 2130477056U, // <0,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0
+ 3204251648U, // <0,3,5,0>: Cost 3 ins <u,3,5,0>, lane 0
+ 3204259840U, // <0,3,5,1>: Cost 3 ins <u,3,5,1>, lane 0
+ 2961457910U, // <0,3,5,2>: Cost 3 vzipr <1,4,0,5>, <1,0,3,2>
+ 2086076417U, // <0,3,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3204292608U, // <0,3,5,5>: Cost 3 ins <u,3,5,5>, lane 0
+ 2653769826U, // <0,3,5,6>: Cost 3 vext2 <6,1,0,3>, <5,6,7,0>
+ 2130567168U, // <0,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0
+ 2130567168U, // <0,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0
+ 2854506594U, // <0,3,6,0>: Cost 3 vuzpr <6,0,1,3>, <5,6,7,0>
+ 2653770090U, // <0,3,6,1>: Cost 3 vext2 <6,1,0,3>, <6,1,0,3>
+ 3204341760U, // <0,3,6,2>: Cost 3 ins <u,3,6,2>, lane 0
+ 2086150145U, // <0,3,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 3204358144U, // <0,3,6,4>: Cost 3 ins <u,3,6,4>, lane 0
+ 3204366336U, // <0,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0
+ 3204374528U, // <0,3,6,6>: Cost 3 ins <u,3,6,6>, lane 0
+ 2130640896U, // <0,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0
+ 2086150145U, // <0,3,6,u>: Cost 2 ins <0,u,6,3>, lane 1
+ 2968109974U, // <0,3,7,0>: Cost 3 vzipr <2,5,0,7>, <1,2,3,0>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 2660406420U, // <0,3,7,2>: Cost 3 vext2 <7,2,0,3>, <7,2,0,3>
+ 2086223873U, // <0,3,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 3204431872U, // <0,3,7,4>: Cost 3 ins <u,3,7,4>, lane 0
+ 3204440064U, // <0,3,7,5>: Cost 3 ins <u,3,7,5>, lane 0
+ 2752378305U, // <0,3,7,6>: Cost 3 vuzpl <0,1,3,3>, <7,5,6,7>
+ 3204456448U, // <0,3,7,7>: Cost 3 ins <u,3,7,7>, lane 0
+ 2086223873U, // <0,3,7,u>: Cost 2 ins <0,u,7,3>, lane 1
+ 1817421974U, // <0,3,u,0>: Cost 2 vzipl LHS, <3,0,1,2>
+ 1947437206U, // <0,3,u,1>: Cost 2 vtrnl LHS, <3,0,1,2>
+ 2085699585U, // <0,3,u,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 1012113409U, // <0,3,u,3>: Cost 1 ins LHS, lane 1
+ 1817422338U, // <0,3,u,4>: Cost 2 vzipl LHS, <3,4,5,6>
+ 1947437570U, // <0,3,u,5>: Cost 2 vtrnl LHS, <3,4,5,6>
+ 2085879809U, // <0,3,u,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2130567168U, // <0,3,u,7>: Cost 2 ins <u,3,5,7>, lane 0
+ 1012113409U, // <0,3,u,u>: Cost 1 ins LHS, lane 1
+ 2085683201U, // <0,4,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 2083684357U, // <0,4,0,1>: Cost 2 ins <0,4,u,u>, lane 5
+ 1679392870U, // <0,4,0,2>: Cost 2 vuzpl <0,2,4,6>, LHS
+ 2085707777U, // <0,4,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 1679392972U, // <0,4,0,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2083659778U, // <0,4,0,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 1947503926U, // <0,4,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS
+ 3156836355U, // <0,4,0,7>: Cost 3 ins <0,4,0,u>, lane 3
+ 1947503944U, // <0,4,0,u>: Cost 2 vtrnl <0,2,0,2>, RHS
+ 2083168259U, // <0,4,1,0>: Cost 2 ins <0,4,1,u>, lane 3
+ 2085765121U, // <0,4,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 2083168259U, // <0,4,1,2>: Cost 2 ins <0,4,1,u>, lane 3
+ 2083168259U, // <0,4,1,3>: Cost 2 ins <0,4,1,u>, lane 3
+ 2083168259U, // <0,4,1,4>: Cost 2 ins <0,4,1,u>, lane 3
+ 739036470U, // <0,4,1,5>: Cost 1 vzipl LHS, RHS
+ 1948929334U, // <0,4,1,6>: Cost 2 vtrnl <0,4,1,5>, RHS
+ 2083168259U, // <0,4,1,7>: Cost 2 ins <0,4,1,u>, lane 3
+ 739036713U, // <0,4,1,u>: Cost 1 vzipl LHS, RHS
+ 2083241987U, // <0,4,2,0>: Cost 2 ins <0,4,2,u>, lane 3
+ 2083241987U, // <0,4,2,1>: Cost 2 ins <0,4,2,u>, lane 3
+ 2085847041U, // <0,4,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <0,4,2,3>: Cost 1 ins LHS, lane 1
+ 2083241987U, // <0,4,2,4>: Cost 2 ins <0,4,2,u>, lane 3
+ 1813286198U, // <0,4,2,5>: Cost 2 vzipl <0,2,0,2>, RHS
+ 873254198U, // <0,4,2,6>: Cost 1 vtrnl LHS, RHS
+ 2083241987U, // <0,4,2,7>: Cost 2 ins <0,4,2,u>, lane 3
+ 873254216U, // <0,4,2,u>: Cost 1 vtrnl LHS, RHS
+ 3020811514U, // <0,4,3,0>: Cost 3 vtrnl <0,1,3,3>, <4,5,0,1>
+ 2753136790U, // <0,4,3,1>: Cost 3 vuzpl <0,2,4,6>, <3,0,1,2>
+ 2753136801U, // <0,4,3,2>: Cost 3 vuzpl <0,2,4,6>, <3,0,2,4>
+ 2085928961U, // <0,4,3,3>: Cost 2 ins <0,u,3,3>, lane 1
+ 3204800512U, // <0,4,3,4>: Cost 3 ins <u,4,3,4>, lane 0
+ 2083659778U, // <0,4,3,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 2083667970U, // <0,4,3,6>: Cost 2 ins <0,4,u,6>, lane 2
+ 3087183077U, // <0,4,3,7>: Cost 3 vtrnr <0,0,2,3>, <4,4,6,7>
+ 2083659778U, // <0,4,3,u>: Cost 2 ins <0,4,u,5>, lane 2
+ 2753137995U, // <0,4,4,0>: Cost 3 vuzpl <0,2,4,6>, <4,6,0,1>
+ 2888453090U, // <0,4,4,1>: Cost 3 vzipl <0,4,1,5>, <4,1,5,0>
+ 2888535100U, // <0,4,4,2>: Cost 3 vzipl <0,4,2,6>, <4,2,6,0>
+ 2086002689U, // <0,4,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 2131132416U, // <0,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0
+ 1814711606U, // <0,4,4,5>: Cost 2 vzipl <0,4,1,5>, RHS
+ 1679396150U, // <0,4,4,6>: Cost 2 vuzpl <0,2,4,6>, RHS
+ 3157131267U, // <0,4,4,7>: Cost 3 ins <0,4,4,u>, lane 3
+ 1679396168U, // <0,4,4,u>: Cost 2 vuzpl <0,2,4,6>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3204931584U, // <0,4,5,2>: Cost 3 ins <u,4,5,2>, lane 0
+ 2086076417U, // <0,4,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2131214336U, // <0,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2830699830U, // <0,4,5,7>: Cost 3 vuzpr <2,0,2,4>, RHS
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2712227146U, // <0,4,6,0>: Cost 3 vext3 <4,6,0,0>, <4,6,0,0>
+ 2753138977U, // <0,4,6,1>: Cost 3 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2753138988U, // <0,4,6,2>: Cost 3 vuzpl <0,2,4,6>, <6,0,2,4>
+ 2086150145U, // <0,4,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 2712522094U, // <0,4,6,4>: Cost 3 vext3 <4,6,4,0>, <4,6,4,0>
+ 2083659778U, // <0,4,6,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 2131296256U, // <0,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2083684357U, // <0,4,6,7>: Cost 2 ins <0,4,u,u>, lane 5
+ 2083659778U, // <0,4,6,u>: Cost 2 ins <0,4,u,5>, lane 2
+ 3021106426U, // <0,4,7,0>: Cost 3 vtrnl <0,1,7,3>, <4,5,0,1>
+ 2860487502U, // <0,4,7,1>: Cost 3 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3157377026U, // <0,4,7,2>: Cost 3 ins <0,4,u,2>, lane 2
+ 2086223873U, // <0,4,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 3205095424U, // <0,4,7,4>: Cost 3 ins <u,4,7,4>, lane 0
+ 2083659778U, // <0,4,7,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 2131369984U, // <0,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 2752452204U, // <0,4,7,7>: Cost 3 vuzpl <0,1,4,3>, <7,7,7,7>
+ 2083659778U, // <0,4,7,u>: Cost 2 ins <0,4,u,5>, lane 2
+ 2083168259U, // <0,4,u,0>: Cost 2 ins <0,4,1,u>, lane 3
+ 2083684357U, // <0,4,u,1>: Cost 2 ins <0,4,u,u>, lane 5
+ 1679398702U, // <0,4,u,2>: Cost 2 vuzpl <0,2,4,6>, LHS
+ 1012113409U, // <0,4,u,3>: Cost 1 ins LHS, lane 1
+ 1679392972U, // <0,4,u,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6>
+ 743681334U, // <0,4,u,5>: Cost 1 vzipl LHS, RHS
+ 873696566U, // <0,4,u,6>: Cost 1 vtrnl LHS, RHS
+ 2083168259U, // <0,4,u,7>: Cost 2 ins <0,4,1,u>, lane 3
+ 873696584U, // <0,4,u,u>: Cost 1 vtrnl LHS, RHS
+ 2085683201U, // <0,5,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 2131476480U, // <0,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0
+ 2085699585U, // <0,5,0,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 2085707777U, // <0,5,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 3159457793U, // <0,5,0,4>: Cost 3 ins <0,u,0,4>, lane 1
+ 1678778497U, // <0,5,0,5>: Cost 2 vuzpl <0,1,5,3>, <0,1,5,3>
+ 3159474177U, // <0,5,0,6>: Cost 3 ins <0,u,0,6>, lane 1
+ 2013269302U, // <0,5,0,7>: Cost 2 vtrnr <0,0,0,0>, RHS
+ 2085699585U, // <0,5,0,u>: Cost 2 ins <0,u,0,2>, lane 1
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2085765121U, // <0,5,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 3159515137U, // <0,5,1,2>: Cost 3 ins <0,u,1,2>, lane 1
+ 2085781505U, // <0,5,1,3>: Cost 2 ins <0,u,1,3>, lane 1
+ 1812778950U, // <0,5,1,4>: Cost 2 vzipl LHS, <5,4,7,6>
+ 2085797889U, // <0,5,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 1812779106U, // <0,5,1,6>: Cost 2 vzipl LHS, <5,6,7,0>
+ 2013351222U, // <0,5,1,7>: Cost 2 vtrnr <0,0,1,1>, RHS
+ 2085765121U, // <0,5,1,u>: Cost 2 ins <0,u,1,1>, lane 1
+ 2085830657U, // <0,5,2,0>: Cost 2 ins <0,u,2,0>, lane 1
+ 1946996864U, // <0,5,2,1>: Cost 2 vtrnl LHS, <5,7,1,3>
+ 2085847041U, // <0,5,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <0,5,2,3>: Cost 1 ins LHS, lane 1
+ 2085863425U, // <0,5,2,4>: Cost 2 ins <0,u,2,4>, lane 1
+ 1946996740U, // <0,5,2,5>: Cost 2 vtrnl LHS, <5,5,5,5>
+ 2085879809U, // <0,5,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2019478838U, // <0,5,2,7>: Cost 2 vtrnr <1,0,3,2>, RHS
+ 1012113409U, // <0,5,2,u>: Cost 1 ins LHS, lane 1
+ 2637858966U, // <0,5,3,0>: Cost 3 vext2 <3,4,0,5>, <3,0,1,2>
+ 3205439488U, // <0,5,3,1>: Cost 3 ins <u,5,3,1>, lane 0
+ 3087183153U, // <0,5,3,2>: Cost 3 vtrnr <0,0,2,3>, <4,5,6,2>
+ 2085928961U, // <0,5,3,3>: Cost 2 ins <0,u,3,3>, lane 1
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3205472256U, // <0,5,3,5>: Cost 3 ins <u,5,3,5>, lane 0
+ 3205480448U, // <0,5,3,6>: Cost 3 ins <u,5,3,6>, lane 0
+ 2131746816U, // <0,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0
+ 2131746816U, // <0,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0
+ 2888453704U, // <0,5,4,0>: Cost 3 vzipl <0,4,1,5>, <5,0,1,2>
+ 3159728129U, // <0,5,4,1>: Cost 3 ins <0,u,4,1>, lane 1
+ 3159736321U, // <0,5,4,2>: Cost 3 ins <0,u,4,2>, lane 1
+ 2086002689U, // <0,5,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 2888454068U, // <0,5,4,4>: Cost 3 vzipl <0,4,1,5>, <5,4,5,6>
+ 2131804160U, // <0,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0
+ 2086027265U, // <0,5,4,6>: Cost 2 ins <0,u,4,6>, lane 1
+ 2131820544U, // <0,5,4,7>: Cost 2 ins <u,5,4,7>, lane 0
+ 2086027265U, // <0,5,4,u>: Cost 2 ins <0,u,4,6>, lane 1
+ 3205578752U, // <0,5,5,0>: Cost 3 ins <u,5,5,0>, lane 0
+ 2997291922U, // <0,5,5,1>: Cost 3 vzipr <7,4,0,5>, <4,0,5,1>
+ 2752523939U, // <0,5,5,2>: Cost 3 vuzpl <0,1,5,3>, <5,1,2,3>
+ 2086076417U, // <0,5,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 3205611520U, // <0,5,5,4>: Cost 3 ins <u,5,5,4>, lane 0
+ 2131877888U, // <0,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2131894272U, // <0,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0
+ 2086076417U, // <0,5,5,u>: Cost 2 ins <0,u,5,3>, lane 1
+ 2131910656U, // <0,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <0,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <0,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <0,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <0,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <0,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <0,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <0,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <0,5,6,u>: Cost 1 ins RHS, lane 0
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 2086223873U, // <0,5,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2132041728U, // <0,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0
+ 2132041728U, // <0,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2085765121U, // <0,5,u,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 2085699585U, // <0,5,u,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 1012113409U, // <0,5,u,3>: Cost 1 ins LHS, lane 1
+ 1817423814U, // <0,5,u,4>: Cost 2 vzipl LHS, <5,4,7,6>
+ 2085797889U, // <0,5,u,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 2085879809U, // <0,5,u,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 1058226176U, // <0,5,u,7>: Cost 1 ins RHS, lane 0
+ 1012113409U, // <0,5,u,u>: Cost 1 ins LHS, lane 1
+ 2085683201U, // <0,6,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 2085691393U, // <0,6,0,1>: Cost 2 ins <0,u,0,1>, lane 1
+ 2132148224U, // <0,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 2085707777U, // <0,6,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 1678852234U, // <0,6,0,6>: Cost 2 vuzpl <0,1,6,3>, <0,1,6,3>
+ 1879051574U, // <0,6,0,7>: Cost 2 vzipr <0,0,0,0>, RHS
+ 2132148224U, // <0,6,0,u>: Cost 2 ins <u,6,0,2>, lane 0
+ 2993278336U, // <0,6,1,0>: Cost 3 vzipr <6,7,0,1>, <4,6,6,0>
+ 2085765121U, // <0,6,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 1812779514U, // <0,6,1,2>: Cost 2 vzipl LHS, <6,2,7,3>
+ 2085781505U, // <0,6,1,3>: Cost 2 ins <0,u,1,3>, lane 1
+ 3159531521U, // <0,6,1,4>: Cost 3 ins <0,u,1,4>, lane 1
+ 2085797889U, // <0,6,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 1812779832U, // <0,6,1,6>: Cost 2 vzipl LHS, <6,6,6,6>
+ 1892994358U, // <0,6,1,7>: Cost 2 vzipr <2,3,0,1>, RHS
+ 1892994359U, // <0,6,1,u>: Cost 2 vzipr <2,3,0,1>, RHS
+ 1946997582U, // <0,6,2,0>: Cost 2 vtrnl LHS, <6,7,0,1>
+ 2085838849U, // <0,6,2,1>: Cost 2 ins <0,u,2,1>, lane 1
+ 2085847041U, // <0,6,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <0,6,2,3>: Cost 1 ins LHS, lane 1
+ 1946997622U, // <0,6,2,4>: Cost 2 vtrnl LHS, <6,7,4,5>
+ 2085871617U, // <0,6,2,5>: Cost 2 ins <0,u,2,5>, lane 1
+ 2085879809U, // <0,6,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 1880395062U, // <0,6,2,7>: Cost 2 vzipr <0,2,0,2>, RHS
+ 1012113409U, // <0,6,2,u>: Cost 1 ins LHS, lane 1
+ 3122942050U, // <0,6,3,0>: Cost 3 vtrnr <6,0,1,3>, <5,6,7,0>
+ 2250527010U, // <0,6,3,1>: Cost 3 vrev <6,0,1,3>
+ 3206111232U, // <0,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0
+ 2085928961U, // <0,6,3,3>: Cost 2 ins <0,u,3,3>, lane 1
+ 3206127616U, // <0,6,3,4>: Cost 3 ins <u,6,3,4>, lane 0
+ 3206135808U, // <0,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0
+ 3206144000U, // <0,6,3,6>: Cost 3 ins <u,6,3,6>, lane 0
+ 2132410368U, // <0,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0
+ 2132410368U, // <0,6,3,u>: Cost 2 ins <u,6,3,7>, lane 0
+ 2888536380U, // <0,6,4,0>: Cost 3 vzipl <0,4,2,6>, <6,0,4,2>
+ 3021574433U, // <0,6,4,1>: Cost 3 vtrnl <0,2,4,6>, <6,0,1,2>
+ 3021574444U, // <0,6,4,2>: Cost 3 vtrnl <0,2,4,6>, <6,0,2,4>
+ 2086002689U, // <0,6,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2086019073U, // <0,6,4,5>: Cost 2 ins <0,u,4,5>, lane 1
+ 2132475904U, // <0,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 2954153270U, // <0,6,4,7>: Cost 3 vzipr <0,2,0,4>, RHS
+ 2132475904U, // <0,6,4,u>: Cost 2 ins <u,6,4,6>, lane 0
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3206250496U, // <0,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0
+ 3206258688U, // <0,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0
+ 2086076417U, // <0,6,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 3206275072U, // <0,6,5,4>: Cost 3 ins <u,6,5,4>, lane 0
+ 3206283264U, // <0,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0
+ 3206291456U, // <0,6,5,6>: Cost 3 ins <u,6,5,6>, lane 0
+ 2961460534U, // <0,6,5,7>: Cost 3 vzipr <1,4,0,5>, RHS
+ 2086076417U, // <0,6,5,u>: Cost 2 ins <0,u,5,3>, lane 1
+ 2724172540U, // <0,6,6,0>: Cost 3 vext3 <6,6,0,0>, <6,6,0,0>
+ 2889838972U, // <0,6,6,1>: Cost 3 vzipl <0,6,2,3>, <6,1,2,3>
+ 2997300124U, // <0,6,6,2>: Cost 3 vzipr <7,4,0,6>, <4,0,6,2>
+ 2086150145U, // <0,6,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 3206348800U, // <0,6,6,4>: Cost 3 ins <u,6,6,4>, lane 0
+ 2889839336U, // <0,6,6,5>: Cost 3 vzipl <0,6,2,3>, <6,5,6,7>
+ 2132623360U, // <0,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0
+ 2132631552U, // <0,6,6,7>: Cost 2 ins <u,6,6,7>, lane 0
+ 2086150145U, // <0,6,6,u>: Cost 2 ins <0,u,6,3>, lane 1
+ 2132647936U, // <0,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3206406144U, // <0,6,7,2>: Cost 3 ins <u,6,7,2>, lane 0
+ 2086223873U, // <0,6,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 2132680704U, // <0,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3206438912U, // <0,6,7,6>: Cost 3 ins <u,6,7,6>, lane 0
+ 2132705280U, // <0,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0
+ 2132647936U, // <0,6,7,u>: Cost 2 ins <u,6,7,0>, lane 0
+ 2132647936U, // <0,6,u,0>: Cost 2 ins <u,6,7,0>, lane 0
+ 2085765121U, // <0,6,u,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 2132148224U, // <0,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 1012113409U, // <0,6,u,3>: Cost 1 ins LHS, lane 1
+ 2132680704U, // <0,6,u,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 2085797889U, // <0,6,u,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 2085879809U, // <0,6,u,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 1880444214U, // <0,6,u,7>: Cost 2 vzipr <0,2,0,u>, RHS
+ 1012113409U, // <0,6,u,u>: Cost 1 ins LHS, lane 1
+ 2085683201U, // <0,7,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 2132803584U, // <0,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0
+ 2085699585U, // <0,7,0,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 2085707777U, // <0,7,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 2580516150U, // <0,7,0,4>: Cost 3 vext1 <5,0,7,0>, RHS
+ 2580516476U, // <0,7,0,5>: Cost 3 vext1 <5,0,7,0>, <5,0,7,0>
+ 2586489173U, // <0,7,0,6>: Cost 3 vext1 <6,0,7,0>, <6,0,7,0>
+ 1678925971U, // <0,7,0,7>: Cost 2 vuzpl <0,1,7,3>, <0,1,7,3>
+ 2132803584U, // <0,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0
+ 1812780026U, // <0,7,1,0>: Cost 2 vzipl LHS, <7,0,1,2>
+ 2085765121U, // <0,7,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 2132893696U, // <0,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0
+ 1812780390U, // <0,7,1,4>: Cost 2 vzipl LHS, <7,4,5,6>
+ 2085797889U, // <0,7,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 2586497366U, // <0,7,1,6>: Cost 3 vext1 <6,0,7,1>, <6,0,7,1>
+ 1812780652U, // <0,7,1,7>: Cost 2 vzipl LHS, <7,7,7,7>
+ 2085765121U, // <0,7,1,u>: Cost 2 ins <0,u,1,1>, lane 1
+ 2085830657U, // <0,7,2,0>: Cost 2 ins <0,u,2,0>, lane 1
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 2085847041U, // <0,7,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <0,7,2,3>: Cost 1 ins LHS, lane 1
+ 2085863425U, // <0,7,2,4>: Cost 2 ins <0,u,2,4>, lane 1
+ 1946998118U, // <0,7,2,5>: Cost 2 vtrnl LHS, <7,4,5,6>
+ 2085879809U, // <0,7,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 1946998380U, // <0,7,2,7>: Cost 2 vtrnl LHS, <7,7,7,7>
+ 1012113409U, // <0,7,2,u>: Cost 1 ins LHS, lane 1
+ 2989314146U, // <0,7,3,0>: Cost 3 vzipr <6,1,0,3>, <5,6,7,0>
+ 3206766592U, // <0,7,3,1>: Cost 3 ins <u,7,3,1>, lane 0
+ 3020813397U, // <0,7,3,2>: Cost 3 vtrnl <0,1,3,3>, <7,1,2,3>
+ 2085928961U, // <0,7,3,3>: Cost 2 ins <0,u,3,3>, lane 1
+ 3206791168U, // <0,7,3,4>: Cost 3 ins <u,7,3,4>, lane 0
+ 3206799360U, // <0,7,3,5>: Cost 3 ins <u,7,3,5>, lane 0
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3206815744U, // <0,7,3,7>: Cost 3 ins <u,7,3,7>, lane 0
+ 2085928961U, // <0,7,3,u>: Cost 2 ins <0,u,3,3>, lane 1
+ 3206832128U, // <0,7,4,0>: Cost 3 ins <u,7,4,0>, lane 0
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 2086002689U, // <0,7,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 3206864896U, // <0,7,4,4>: Cost 3 ins <u,7,4,4>, lane 0
+ 2133131264U, // <0,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0
+ 2086027265U, // <0,7,4,6>: Cost 2 ins <0,u,4,6>, lane 1
+ 3020887660U, // <0,7,4,7>: Cost 3 vtrnl <0,1,4,3>, <7,7,7,7>
+ 2133131264U, // <0,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0
+ 2993311842U, // <0,7,5,0>: Cost 3 vzipr <6,7,0,5>, <5,6,7,0>
+ 3206914048U, // <0,7,5,1>: Cost 3 ins <u,7,5,1>, lane 0
+ 3020960853U, // <0,7,5,2>: Cost 3 vtrnl <0,1,5,3>, <7,1,2,3>
+ 2086076417U, // <0,7,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3206946816U, // <0,7,5,5>: Cost 3 ins <u,7,5,5>, lane 0
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2133221376U, // <0,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0
+ 2133221376U, // <0,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0
+ 2854834274U, // <0,7,6,0>: Cost 3 vuzpr <6,0,5,7>, <5,6,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3206995968U, // <0,7,6,2>: Cost 3 ins <u,7,6,2>, lane 0
+ 2086150145U, // <0,7,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 3207012352U, // <0,7,6,4>: Cost 3 ins <u,7,6,4>, lane 0
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3207028736U, // <0,7,6,6>: Cost 3 ins <u,7,6,6>, lane 0
+ 2133295104U, // <0,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0
+ 2086150145U, // <0,7,6,u>: Cost 2 ins <0,u,6,3>, lane 1
+ 2992001122U, // <0,7,7,0>: Cost 3 vzipr <6,5,0,7>, <5,6,7,0>
+ 3207061504U, // <0,7,7,1>: Cost 3 ins <u,7,7,1>, lane 0
+ 2752672853U, // <0,7,7,2>: Cost 3 vuzpl <0,1,7,3>, <7,1,2,3>
+ 2086223873U, // <0,7,7,3>: Cost 2 ins <0,u,7,3>, lane 1
+ 3207086080U, // <0,7,7,4>: Cost 3 ins <u,7,7,4>, lane 0
+ 3207094272U, // <0,7,7,5>: Cost 3 ins <u,7,7,5>, lane 0
+ 2663093724U, // <0,7,7,6>: Cost 3 vext2 <7,6,0,7>, <7,6,0,7>
+ 2133368832U, // <0,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0
+ 2086223873U, // <0,7,7,u>: Cost 2 ins <0,u,7,3>, lane 1
+ 1817424890U, // <0,7,u,0>: Cost 2 vzipl LHS, <7,0,1,2>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2085699585U, // <0,7,u,2>: Cost 2 ins <0,u,0,2>, lane 1
+ 1012113409U, // <0,7,u,3>: Cost 1 ins LHS, lane 1
+ 1817425254U, // <0,7,u,4>: Cost 2 vzipl LHS, <7,4,5,6>
+ 2085797889U, // <0,7,u,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 2085879809U, // <0,7,u,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2133221376U, // <0,7,u,7>: Cost 2 ins <u,7,5,7>, lane 0
+ 1012113409U, // <0,7,u,u>: Cost 1 ins LHS, lane 1
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1007951877U, // <0,u,0,1>: Cost 1 ins LHS, lane 5
+ 605257830U, // <0,u,0,2>: Cost 1 vuzpl LHS, LHS
+ 1007910914U, // <0,u,0,3>: Cost 1 ins LHS, lane 2
+ 1678999756U, // <0,u,0,4>: Cost 2 vuzpl LHS, <0,2,4,6>
+ 2081767427U, // <0,u,0,5>: Cost 2 ins <0,2,0,u>, lane 3
+ 1947506842U, // <0,u,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS
+ 2081767427U, // <0,u,0,7>: Cost 2 ins <0,2,0,u>, lane 3
+ 605257884U, // <0,u,0,u>: Cost 1 vuzpl LHS, LHS
+ 1812821715U, // <0,u,1,0>: Cost 2 vzipl LHS, <u,0,1,2>
+ 739039022U, // <0,u,1,1>: Cost 1 vzipl LHS, LHS
+ 1813264264U, // <0,u,1,2>: Cost 2 vzipl LHS, <u,2,3,3>
+ 1007910914U, // <0,u,1,3>: Cost 1 ins LHS, lane 2
+ 1812822079U, // <0,u,1,4>: Cost 2 vzipl LHS, <u,4,5,6>
+ 739039386U, // <0,u,1,5>: Cost 1 vzipl LHS, RHS
+ 1813264592U, // <0,u,1,6>: Cost 2 vzipl LHS, <u,6,3,7>
+ 1892994376U, // <0,u,1,7>: Cost 2 vzipr <2,3,0,1>, RHS
+ 739039589U, // <0,u,1,u>: Cost 1 vzipl LHS, LHS
+ 1007509507U, // <0,u,2,0>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,u,2,1>: Cost 1 ins LHS, lane 3
+ 873256750U, // <0,u,2,2>: Cost 1 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1007509507U, // <0,u,2,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <0,u,2,5>: Cost 1 ins LHS, lane 3
+ 873257114U, // <0,u,2,6>: Cost 1 vtrnl LHS, RHS
+ 1007509507U, // <0,u,2,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2133680132U, // <0,u,3,0>: Cost 2 ins <u,u,3,0>, lane 4
+ 1679001750U, // <0,u,3,1>: Cost 2 vuzpl LHS, <3,0,1,2>
+ 2128388096U, // <0,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 1007910914U, // <0,u,3,3>: Cost 1 ins LHS, lane 2
+ 2133712900U, // <0,u,3,4>: Cost 2 ins <u,u,3,4>, lane 4
+ 1679002114U, // <0,u,3,5>: Cost 2 vuzpl LHS, <3,4,5,6>
+ 2082340866U, // <0,u,3,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 2133737476U, // <0,u,3,7>: Cost 2 ins <u,u,3,7>, lane 4
+ 1007910914U, // <0,u,3,u>: Cost 1 ins LHS, lane 2
+ 2082062339U, // <0,u,4,0>: Cost 2 ins <0,2,4,u>, lane 3
+ 1814714158U, // <0,u,4,1>: Cost 2 vzipl <0,4,1,5>, LHS
+ 1947834158U, // <0,u,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS
+ 1007910914U, // <0,u,4,3>: Cost 1 ins LHS, lane 2
+ 1947828428U, // <0,u,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1007951877U, // <0,u,4,5>: Cost 1 ins LHS, lane 5
+ 605261110U, // <0,u,4,6>: Cost 1 vuzpl LHS, RHS
+ 2082062339U, // <0,u,4,7>: Cost 2 ins <0,2,4,u>, lane 3
+ 605261128U, // <0,u,4,u>: Cost 1 vuzpl LHS, RHS
+ 2080964610U, // <0,u,5,0>: Cost 2 ins <0,0,u,0>, lane 2
+ 2128527360U, // <0,u,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 2080980994U, // <0,u,5,2>: Cost 2 ins <0,0,u,2>, lane 2
+ 1007910914U, // <0,u,5,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <0,u,5,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 2133868548U, // <0,u,5,5>: Cost 2 ins <u,u,5,5>, lane 4
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 1751092534U, // <0,u,5,7>: Cost 2 vuzpr <1,0,3,u>, RHS
+ 1007910914U, // <0,u,5,u>: Cost 1 ins LHS, lane 2
+ 1679004494U, // <0,u,6,0>: Cost 2 vuzpl LHS, <6,7,0,1>
+ 2080972802U, // <0,u,6,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 2128609280U, // <0,u,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 1007910914U, // <0,u,6,3>: Cost 1 ins LHS, lane 2
+ 1679004534U, // <0,u,6,4>: Cost 2 vuzpl LHS, <6,7,4,5>
+ 2083659778U, // <0,u,6,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 2133950468U, // <0,u,6,6>: Cost 2 ins <u,u,6,6>, lane 4
+ 1060216836U, // <0,u,6,7>: Cost 1 ins RHS, lane 4
+ 1007910914U, // <0,u,6,u>: Cost 1 ins LHS, lane 2
+ 2133975044U, // <0,u,7,0>: Cost 2 ins <u,u,7,0>, lane 4
+ 2080972802U, // <0,u,7,1>: Cost 2 ins <0,0,u,1>, lane 2
+ 2080980994U, // <0,u,7,2>: Cost 2 ins <0,0,u,2>, lane 2
+ 1007910914U, // <0,u,7,3>: Cost 1 ins LHS, lane 2
+ 2134007812U, // <0,u,7,4>: Cost 2 ins <u,u,7,4>, lane 4
+ 2083659778U, // <0,u,7,5>: Cost 2 ins <0,4,u,5>, lane 2
+ 2134024196U, // <0,u,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 2134032388U, // <0,u,7,7>: Cost 2 ins <u,u,7,7>, lane 4
+ 1007910914U, // <0,u,7,u>: Cost 1 ins LHS, lane 2
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 743683886U, // <0,u,u,1>: Cost 1 vzipl LHS, LHS
+ 605263662U, // <0,u,u,2>: Cost 1 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1007509507U, // <0,u,u,4>: Cost 1 ins LHS, lane 3
+ 743684250U, // <0,u,u,5>: Cost 1 vzipl LHS, RHS
+ 605264026U, // <0,u,u,6>: Cost 1 vuzpl LHS, RHS
+ 1007509507U, // <0,u,u,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2128150528U, // <1,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0
+ 1818148966U, // <1,0,0,1>: Cost 2 vzipl <1,0,3,2>, LHS
+ 2086952962U, // <1,0,0,2>: Cost 2 ins <1,0,u,2>, lane 2
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2891891026U, // <1,0,0,4>: Cost 3 vzipl <1,0,3,2>, <0,4,1,5>
+ 3165437953U, // <1,0,0,5>: Cost 3 ins <1,u,0,5>, lane 1
+ 3160154115U, // <1,0,0,6>: Cost 3 ins <1,0,0,u>, lane 3
+ 3160154115U, // <1,0,0,7>: Cost 3 ins <1,0,0,u>, lane 3
+ 1818149533U, // <1,0,0,u>: Cost 2 vzipl <1,0,3,2>, LHS
+ 1141522514U, // <1,0,1,0>: Cost 2 vrev <0,1,0,1>
+ 1818656870U, // <1,0,1,1>: Cost 2 vzipl <1,1,1,1>, LHS
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2091753473U, // <1,0,1,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 1477070134U, // <1,0,1,4>: Cost 2 vext1 <0,1,0,1>, RHS
+ 2760770560U, // <1,0,1,5>: Cost 3 vuzpl <1,5,0,2>, <1,3,5,7>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3165528065U, // <1,0,1,7>: Cost 3 ins <1,u,1,7>, lane 1
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 1819459686U, // <1,0,2,1>: Cost 2 vzipl <1,2,3,0>, LHS
+ 2128314368U, // <1,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2087002117U, // <1,0,2,3>: Cost 2 ins <1,0,u,u>, lane 5
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 2970798548U, // <1,0,2,5>: Cost 3 vzipr <3,0,1,2>, <3,4,0,5>
+ 3165593601U, // <1,0,2,6>: Cost 3 ins <1,u,2,6>, lane 1
+ 2592625730U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,1,0,2>
+ 1819460253U, // <1,0,2,u>: Cost 2 vzipl <1,2,3,0>, LHS
+ 2014101504U, // <1,0,3,0>: Cost 2 vtrnr LHS, <0,0,0,0>
+ 2014101514U, // <1,0,3,1>: Cost 2 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2091900929U, // <1,0,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2091909121U, // <1,0,3,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 2086633475U, // <1,0,3,5>: Cost 2 ins <1,0,3,u>, lane 3
+ 2086633475U, // <1,0,3,6>: Cost 2 ins <1,0,3,u>, lane 3
+ 2091933697U, // <1,0,3,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2667752338U, // <1,0,4,0>: Cost 3 vext2 <u,4,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2086952962U, // <1,0,4,2>: Cost 2 ins <1,0,u,2>, lane 2
+ 2819383641U, // <1,0,4,3>: Cost 3 vuzpr <0,1,2,0>, <0,4,2,3>
+ 2894569810U, // <1,0,4,4>: Cost 3 vzipl <1,4,3,5>, <0,4,1,5>
+ 2087002117U, // <1,0,4,5>: Cost 2 ins <1,0,u,u>, lane 5
+ 2758102326U, // <1,0,4,6>: Cost 3 vuzpl <1,1,0,0>, RHS
+ 2819386597U, // <1,0,4,7>: Cost 3 vuzpr <0,1,2,0>, <4,4,6,7>
+ 2086952962U, // <1,0,4,u>: Cost 2 ins <1,0,u,2>, lane 2
+ 2955558912U, // <1,0,5,0>: Cost 3 vzipr <0,4,1,5>, <0,0,0,0>
+ 1821507686U, // <1,0,5,1>: Cost 2 vzipl <1,5,3,7>, LHS
+ 1954545766U, // <1,0,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS
+ 3165790209U, // <1,0,5,3>: Cost 3 ins <1,u,5,3>, lane 1
+ 1141850234U, // <1,0,5,4>: Cost 2 vrev <0,1,4,5>
+ 3165806593U, // <1,0,5,5>: Cost 3 ins <1,u,5,5>, lane 1
+ 3202310144U, // <1,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0
+ 2092081153U, // <1,0,5,7>: Cost 2 ins <1,u,5,7>, lane 1
+ 1954545820U, // <1,0,5,u>: Cost 2 vtrnl <1,3,5,7>, LHS
+ 3202334720U, // <1,0,6,0>: Cost 3 ins <u,0,6,0>, lane 0
+ 2895765606U, // <1,0,6,1>: Cost 3 vzipl <1,6,1,7>, LHS
+ 2128609280U, // <1,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 2819383803U, // <1,0,6,3>: Cost 3 vuzpr <0,1,2,0>, <0,6,2,3>
+ 2896060754U, // <1,0,6,4>: Cost 3 vzipl <1,6,5,7>, <0,4,1,5>
+ 2215673988U, // <1,0,6,5>: Cost 3 vrev <0,1,5,6>
+ 3165888513U, // <1,0,6,6>: Cost 3 ins <1,u,6,6>, lane 1
+ 2087002117U, // <1,0,6,7>: Cost 2 ins <1,0,u,u>, lane 5
+ 2128609280U, // <1,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 2974156454U, // <1,0,7,1>: Cost 3 vzipr <3,5,1,7>, <2,3,0,1>
+ 2086952962U, // <1,0,7,2>: Cost 2 ins <1,0,u,2>, lane 2
+ 2861265024U, // <1,0,7,3>: Cost 3 vuzpr <7,1,3,0>, <5,7,1,3>
+ 3202441216U, // <1,0,7,4>: Cost 3 ins <u,0,7,4>, lane 0
+ 3165954049U, // <1,0,7,5>: Cost 3 ins <1,u,7,5>, lane 1
+ 1142014094U, // <1,0,7,6>: Cost 2 vrev <0,1,6,7>
+ 3165970433U, // <1,0,7,7>: Cost 3 ins <1,u,7,7>, lane 1
+ 2086952962U, // <1,0,7,u>: Cost 2 ins <1,0,u,2>, lane 2
+ 2014142464U, // <1,0,u,0>: Cost 2 vtrnr LHS, <0,0,0,0>
+ 2014142474U, // <1,0,u,1>: Cost 2 vtrnr LHS, <0,0,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2091753473U, // <1,0,u,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 2091909121U, // <1,0,u,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 2086633475U, // <1,0,u,5>: Cost 2 ins <1,0,3,u>, lane 3
+ 2086633475U, // <1,0,u,6>: Cost 2 ins <1,0,3,u>, lane 3
+ 2091933697U, // <1,0,u,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1818149622U, // <1,1,0,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 1684439142U, // <1,1,0,2>: Cost 2 vuzpl <1,1,1,1>, LHS
+ 2087624706U, // <1,1,0,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2891891856U, // <1,1,0,5>: Cost 3 vzipl <1,0,3,2>, <1,5,3,7>
+ 3161391106U, // <1,1,0,6>: Cost 3 ins <1,1,u,6>, lane 2
+ 3161399298U, // <1,1,0,7>: Cost 3 ins <1,1,u,7>, lane 2
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2087149571U, // <1,1,1,2>: Cost 2 ins <1,1,1,u>, lane 3
+ 1751548006U, // <1,1,1,3>: Cost 2 vuzpr <1,1,1,1>, LHS
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2087149571U, // <1,1,1,5>: Cost 2 ins <1,1,1,u>, lane 3
+ 2087149571U, // <1,1,1,6>: Cost 2 ins <1,1,1,u>, lane 3
+ 2087149571U, // <1,1,1,7>: Cost 2 ins <1,1,1,u>, lane 3
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2128961536U, // <1,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <1,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 1819460502U, // <1,1,2,2>: Cost 2 vzipl <1,2,3,0>, <1,2,3,0>
+ 1055244288U, // <1,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <1,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <1,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <1,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <1,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <1,1,2,u>: Cost 1 ins LHS, lane 0
+ 2091876353U, // <1,1,3,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 2014102324U, // <1,1,3,1>: Cost 2 vtrnr LHS, <1,1,1,1>
+ 2091892737U, // <1,1,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 940359782U, // <1,1,3,3>: Cost 1 vtrnr LHS, LHS
+ 2091909121U, // <1,1,3,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 2087297027U, // <1,1,3,5>: Cost 2 ins <1,1,3,u>, lane 3
+ 2087297027U, // <1,1,3,6>: Cost 2 ins <1,1,3,u>, lane 3
+ 2091933697U, // <1,1,3,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 940359787U, // <1,1,3,u>: Cost 1 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2087608322U, // <1,1,4,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 2894496662U, // <1,1,4,2>: Cost 3 vzipl <1,4,2,5>, <1,2,3,0>
+ 2087624706U, // <1,1,4,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 2014109799U, // <1,1,4,4>: Cost 2 vtrnr <0,1,2,4>, <0,1,2,4>
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 1684442422U, // <1,1,4,6>: Cost 2 vuzpl <1,1,1,1>, RHS
+ 3161399298U, // <1,1,4,7>: Cost 3 ins <1,1,u,7>, lane 2
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 3028288624U, // <1,1,5,0>: Cost 3 vtrnl <1,3,5,7>, <1,5,0,2>
+ 2087608322U, // <1,1,5,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 2955561110U, // <1,1,5,2>: Cost 3 vzipr <0,4,1,5>, <3,0,1,2>
+ 2087624706U, // <1,1,5,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 2955558925U, // <1,1,5,4>: Cost 3 vzipr <0,4,1,5>, <0,0,1,4>
+ 1881817426U, // <1,1,5,5>: Cost 2 vzipr <0,4,1,5>, <0,4,1,5>
+ 2670415970U, // <1,1,5,6>: Cost 3 vext2 <u,u,1,1>, <5,6,7,0>
+ 1751551286U, // <1,1,5,7>: Cost 2 vuzpr <1,1,1,1>, RHS
+ 1751551287U, // <1,1,5,u>: Cost 2 vuzpr <1,1,1,1>, RHS
+ 3165839361U, // <1,1,6,0>: Cost 3 ins <1,u,6,0>, lane 1
+ 2087608322U, // <1,1,6,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 2973485206U, // <1,1,6,2>: Cost 3 vzipr <3,4,1,6>, <3,0,1,2>
+ 2087624706U, // <1,1,6,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 2221572948U, // <1,1,6,4>: Cost 3 vrev <1,1,4,6>
+ 2955567442U, // <1,1,6,5>: Cost 3 vzipr <0,4,1,6>, <0,4,1,5>
+ 2014126185U, // <1,1,6,6>: Cost 2 vtrnr <0,1,2,6>, <0,1,2,6>
+ 2087665669U, // <1,1,6,7>: Cost 2 ins <1,1,u,u>, lane 5
+ 2087624706U, // <1,1,6,u>: Cost 2 ins <1,1,u,3>, lane 2
+ 2670416890U, // <1,1,7,0>: Cost 3 vext2 <u,u,1,1>, <7,0,1,2>
+ 2087608322U, // <1,1,7,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 3203088384U, // <1,1,7,2>: Cost 3 ins <u,1,7,2>, lane 0
+ 2129354752U, // <1,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0
+ 2670417254U, // <1,1,7,4>: Cost 3 vext2 <u,u,1,1>, <7,4,5,6>
+ 2221654878U, // <1,1,7,5>: Cost 3 vrev <1,1,5,7>
+ 3161391106U, // <1,1,7,6>: Cost 3 ins <1,1,u,6>, lane 2
+ 2014134378U, // <1,1,7,7>: Cost 2 vtrnr <0,1,2,7>, <0,1,2,7>
+ 2129354752U, // <1,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0
+ 1818149622U, // <1,1,u,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2>
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 1684444974U, // <1,1,u,2>: Cost 2 vuzpl <1,1,1,1>, LHS
+ 940400742U, // <1,1,u,3>: Cost 1 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 1684445338U, // <1,1,u,6>: Cost 2 vuzpl <1,1,1,1>, RHS
+ 1751551529U, // <1,1,u,7>: Cost 2 vuzpr <1,1,1,1>, RHS
+ 940400747U, // <1,1,u,u>: Cost 1 vtrnr LHS, LHS
+ 2088263682U, // <1,2,0,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2129494016U, // <1,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 2954854502U, // <1,2,0,3>: Cost 3 vzipr <0,3,1,0>, LHS
+ 2088296450U, // <1,2,0,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 3165437953U, // <1,2,0,5>: Cost 3 ins <1,u,0,5>, lane 1
+ 2891892666U, // <1,2,0,6>: Cost 3 vzipl <1,0,3,2>, <2,6,3,7>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2088263682U, // <1,2,1,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 2091737089U, // <1,2,1,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 1745657957U, // <1,2,1,2>: Cost 2 vuzpr <0,1,2,2>, <0,1,2,2>
+ 1884438630U, // <1,2,1,3>: Cost 2 vzipr <0,u,1,1>, LHS
+ 2088296450U, // <1,2,1,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2958180700U, // <1,2,1,6>: Cost 3 vzipr <0,u,1,1>, <0,4,2,6>
+ 3165528065U, // <1,2,1,7>: Cost 3 ins <1,u,1,7>, lane 1
+ 1884438635U, // <1,2,1,u>: Cost 2 vzipr <0,u,1,1>, LHS
+ 2088263682U, // <1,2,2,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 2893235754U, // <1,2,2,1>: Cost 3 vzipl <1,2,3,4>, <2,1,4,3>
+ 2129641472U, // <1,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0
+ 1897054310U, // <1,2,2,3>: Cost 2 vzipr <3,0,1,2>, LHS
+ 2088296450U, // <1,2,2,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 3165585409U, // <1,2,2,5>: Cost 3 ins <1,u,2,5>, lane 1
+ 2893203386U, // <1,2,2,6>: Cost 3 vzipl <1,2,3,0>, <2,6,3,7>
+ 2994684010U, // <1,2,2,7>: Cost 3 vzipr <7,0,1,2>, <0,1,2,7>
+ 1897054315U, // <1,2,2,u>: Cost 2 vzipr <3,0,1,2>, LHS
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 2014101708U, // <1,2,3,6>: Cost 2 vtrnr LHS, <0,2,4,6>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2088263682U, // <1,2,4,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 3162013698U, // <1,2,4,1>: Cost 3 ins <1,2,u,1>, lane 2
+ 3162021890U, // <1,2,4,2>: Cost 3 ins <1,2,u,2>, lane 2
+ 2954887270U, // <1,2,4,3>: Cost 3 vzipr <0,3,1,4>, LHS
+ 2088296450U, // <1,2,4,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2129821696U, // <1,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2088263682U, // <1,2,5,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 2955558932U, // <1,2,5,2>: Cost 3 vzipr <0,4,1,5>, <0,0,2,2>
+ 1881817190U, // <1,2,5,3>: Cost 2 vzipr <0,4,1,5>, LHS
+ 2088296450U, // <1,2,5,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2955559260U, // <1,2,5,6>: Cost 3 vzipr <0,4,1,5>, <0,4,2,6>
+ 2092081153U, // <1,2,5,7>: Cost 2 ins <1,u,5,7>, lane 1
+ 1881817195U, // <1,2,5,u>: Cost 2 vzipr <0,4,1,5>, LHS
+ 2088263682U, // <1,2,6,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 3162013698U, // <1,2,6,1>: Cost 3 ins <1,2,u,1>, lane 2
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2954240102U, // <1,2,6,3>: Cost 3 vzipr <0,2,1,6>, LHS
+ 2088296450U, // <1,2,6,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 3162046466U, // <1,2,6,5>: Cost 3 ins <1,2,u,5>, lane 2
+ 2895914938U, // <1,2,6,6>: Cost 3 vzipl <1,6,3,7>, <2,6,3,7>
+ 2088329221U, // <1,2,6,7>: Cost 2 ins <1,2,u,u>, lane 5
+ 2088263682U, // <1,2,6,u>: Cost 2 ins <1,2,u,0>, lane 2
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 3203743744U, // <1,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0
+ 3203751936U, // <1,2,7,2>: Cost 3 ins <u,2,7,2>, lane 0
+ 2130018304U, // <1,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0
+ 2088296450U, // <1,2,7,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 3203776512U, // <1,2,7,5>: Cost 3 ins <u,2,7,5>, lane 0
+ 3203784704U, // <1,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2014142668U, // <1,2,u,6>: Cost 2 vtrnr LHS, <0,2,4,6>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1745666048U, // <1,3,0,0>: Cost 2 vuzpr LHS, <0,0,0,0>
+ 1746108426U, // <1,3,0,1>: Cost 2 vuzpr LHS, <0,0,1,1>
+ 1745666806U, // <1,3,0,2>: Cost 2 vuzpr LHS, <1,0,3,2>
+ 2088951810U, // <1,3,0,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 2819850253U, // <1,3,0,4>: Cost 3 vuzpr LHS, <0,0,1,4>
+ 2758984055U, // <1,3,0,5>: Cost 3 vuzpl <1,2,3,0>, <0,4,5,6>
+ 2867183658U, // <1,3,0,6>: Cost 3 vuzpr LHS, <0,0,4,6>
+ 2088984578U, // <1,3,0,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1745668252U, // <1,3,0,u>: Cost 2 vuzpr LHS, <3,0,1,u>
+ 2088476675U, // <1,3,1,0>: Cost 2 ins <1,3,1,u>, lane 3
+ 1745666868U, // <1,3,1,1>: Cost 2 vuzpr LHS, <1,1,1,1>
+ 2088476675U, // <1,3,1,2>: Cost 2 ins <1,3,1,u>, lane 3
+ 671924326U, // <1,3,1,3>: Cost 1 vuzpr LHS, LHS
+ 2088476675U, // <1,3,1,4>: Cost 2 ins <1,3,1,u>, lane 3
+ 2088476675U, // <1,3,1,5>: Cost 2 ins <1,3,1,u>, lane 3
+ 2088476675U, // <1,3,1,6>: Cost 2 ins <1,3,1,u>, lane 3
+ 2088984578U, // <1,3,1,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 671924331U, // <1,3,1,u>: Cost 1 vuzpr LHS, LHS
+ 1745666966U, // <1,3,2,0>: Cost 2 vuzpr LHS, <1,2,3,0>
+ 2819408044U, // <1,3,2,1>: Cost 3 vuzpr LHS, <0,2,1,1>
+ 1745666212U, // <1,3,2,2>: Cost 2 vuzpr LHS, <0,2,0,2>
+ 1746110066U, // <1,3,2,3>: Cost 2 vuzpr LHS, <2,2,3,3>
+ 1745666970U, // <1,3,2,4>: Cost 2 vuzpr LHS, <1,2,3,4>
+ 2819408066U, // <1,3,2,5>: Cost 3 vuzpr LHS, <0,2,3,5>
+ 1745666252U, // <1,3,2,6>: Cost 2 vuzpr LHS, <0,2,4,6>
+ 2088984578U, // <1,3,2,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1745666218U, // <1,3,2,u>: Cost 2 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1745667750U, // <1,3,3,1>: Cost 2 vuzpr LHS, <2,3,0,1>
+ 2091892737U, // <1,3,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 1745667032U, // <1,3,3,3>: Cost 2 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 1745667790U, // <1,3,3,5>: Cost 2 vuzpr LHS, <2,3,4,5>
+ 2819408868U, // <1,3,3,6>: Cost 3 vuzpr LHS, <1,3,2,6>
+ 2014102528U, // <1,3,3,7>: Cost 2 vtrnr LHS, <1,3,5,7>
+ 1745667037U, // <1,3,3,u>: Cost 2 vuzpr LHS, <1,3,1,u>
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2759019375U, // <1,3,4,1>: Cost 3 vuzpl <1,2,3,4>, <4,0,1,2>
+ 2759019466U, // <1,3,4,2>: Cost 3 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2088951810U, // <1,3,4,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 1793445072U, // <1,3,4,4>: Cost 2 vuzpr LHS, <4,4,4,4>
+ 1746108754U, // <1,3,4,5>: Cost 2 vuzpr LHS, <0,4,1,5>
+ 1745668610U, // <1,3,4,6>: Cost 2 vuzpr LHS, <3,4,5,6>
+ 2088984578U, // <1,3,4,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1745668612U, // <1,3,4,u>: Cost 2 vuzpr LHS, <3,4,5,u>
+ 2088771587U, // <1,3,5,0>: Cost 2 ins <1,3,5,u>, lane 3
+ 2088771587U, // <1,3,5,1>: Cost 2 ins <1,3,5,u>, lane 3
+ 2088771587U, // <1,3,5,2>: Cost 2 ins <1,3,5,u>, lane 3
+ 2088951810U, // <1,3,5,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 2088771587U, // <1,3,5,4>: Cost 2 ins <1,3,5,u>, lane 3
+ 1793445892U, // <1,3,5,5>: Cost 2 vuzpr LHS, <5,5,5,5>
+ 2088771587U, // <1,3,5,6>: Cost 2 ins <1,3,5,u>, lane 3
+ 671927606U, // <1,3,5,7>: Cost 1 vuzpr LHS, RHS
+ 671927607U, // <1,3,5,u>: Cost 1 vuzpr LHS, RHS
+ 1793445986U, // <1,3,6,0>: Cost 2 vuzpr LHS, <5,6,7,0>
+ 2867185561U, // <1,3,6,1>: Cost 3 vuzpr LHS, <2,6,0,1>
+ 1793445196U, // <1,3,6,2>: Cost 2 vuzpr LHS, <4,6,0,2>
+ 2088951810U, // <1,3,6,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 1793445990U, // <1,3,6,4>: Cost 2 vuzpr LHS, <5,6,7,4>
+ 2849642738U, // <1,3,6,5>: Cost 3 vuzpr <5,1,7,3>, <u,6,7,5>
+ 1793445236U, // <1,3,6,6>: Cost 2 vuzpr LHS, <4,6,4,6>
+ 1746110394U, // <1,3,6,7>: Cost 2 vuzpr LHS, <2,6,3,7>
+ 1746110395U, // <1,3,6,u>: Cost 2 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 1793446734U, // <1,3,7,1>: Cost 2 vuzpr LHS, <6,7,0,1>
+ 2867187830U, // <1,3,7,2>: Cost 3 vuzpr LHS, <5,7,0,2>
+ 1793446016U, // <1,3,7,3>: Cost 2 vuzpr LHS, <5,7,1,3>
+ 2849637679U, // <1,3,7,4>: Cost 3 vuzpr <5,1,7,3>, <1,7,3,4>
+ 1793446774U, // <1,3,7,5>: Cost 2 vuzpr LHS, <6,7,4,5>
+ 2867185674U, // <1,3,7,6>: Cost 3 vuzpr LHS, <2,7,3,6>
+ 1793446056U, // <1,3,7,7>: Cost 2 vuzpr LHS, <5,7,5,7>
+ 1793446021U, // <1,3,7,u>: Cost 2 vuzpr LHS, <5,7,1,u>
+ 1746109820U, // <1,3,u,0>: Cost 2 vuzpr LHS, <1,u,3,0>
+ 2014144166U, // <1,3,u,1>: Cost 2 vtrnr LHS, <2,3,0,1>
+ 1745668894U, // <1,3,u,2>: Cost 2 vuzpr LHS, <3,u,1,2>
+ 671924893U, // <1,3,u,3>: Cost 1 vuzpr LHS, LHS
+ 1746109824U, // <1,3,u,4>: Cost 2 vuzpr LHS, <1,u,3,4>
+ 2014144206U, // <1,3,u,5>: Cost 2 vtrnr LHS, <2,3,4,5>
+ 1745668934U, // <1,3,u,6>: Cost 2 vuzpr LHS, <3,u,5,6>
+ 671927849U, // <1,3,u,7>: Cost 1 vuzpr LHS, RHS
+ 671924898U, // <1,3,u,u>: Cost 1 vuzpr LHS, LHS
+ 3165396993U, // <1,4,0,0>: Cost 3 ins <1,u,0,0>, lane 1
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2758434918U, // <1,4,0,2>: Cost 3 vuzpl <1,1,4,5>, LHS
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 3165429761U, // <1,4,0,4>: Cost 3 ins <1,u,0,4>, lane 1
+ 1818152246U, // <1,4,0,5>: Cost 2 vzipl <1,0,3,2>, RHS
+ 3026537782U, // <1,4,0,6>: Cost 3 vtrnl <1,1,0,0>, RHS
+ 3162808323U, // <1,4,0,7>: Cost 3 ins <1,4,0,u>, lane 3
+ 1818152489U, // <1,4,0,u>: Cost 2 vzipl <1,0,3,2>, RHS
+ 3204620288U, // <1,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0
+ 2091737089U, // <1,4,1,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 3204636672U, // <1,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0
+ 2091753473U, // <1,4,1,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 1745674343U, // <1,4,1,4>: Cost 2 vuzpr <0,1,2,4>, <0,1,2,4>
+ 1818660150U, // <1,4,1,5>: Cost 2 vzipl <1,1,1,1>, RHS
+ 1952877878U, // <1,4,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS
+ 3165528065U, // <1,4,1,7>: Cost 3 ins <1,u,1,7>, lane 1
+ 1818660393U, // <1,4,1,u>: Cost 2 vzipl <1,1,1,1>, RHS
+ 2893237103U, // <1,4,2,0>: Cost 3 vzipl <1,2,3,4>, <4,0,1,2>
+ 2893237194U, // <1,4,2,1>: Cost 3 vzipl <1,2,3,4>, <4,1,2,3>
+ 3165560833U, // <1,4,2,2>: Cost 3 ins <1,u,2,2>, lane 1
+ 2130976768U, // <1,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0
+ 2893237467U, // <1,4,2,4>: Cost 3 vzipl <1,2,3,4>, <4,4,5,6>
+ 1819462966U, // <1,4,2,5>: Cost 2 vzipl <1,2,3,0>, RHS
+ 2131001344U, // <1,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 3165601793U, // <1,4,2,7>: Cost 3 ins <1,u,2,7>, lane 1
+ 1819463209U, // <1,4,2,u>: Cost 2 vzipl <1,2,3,0>, RHS
+ 2091876353U, // <1,4,3,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 3027454831U, // <1,4,3,1>: Cost 3 vtrnl <1,2,3,4>, <4,0,1,2>
+ 2091892737U, // <1,4,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2091900929U, // <1,4,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2061880528U, // <1,4,3,4>: Cost 2 vtrnr LHS, <4,4,4,4>
+ 2014101842U, // <1,4,3,5>: Cost 2 vtrnr LHS, <0,4,1,5>
+ 2014101852U, // <1,4,3,6>: Cost 2 vtrnr LHS, <0,4,2,6>
+ 2091933697U, // <1,4,3,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 2014101845U, // <1,4,3,u>: Cost 2 vtrnr LHS, <0,4,1,u>
+ 2557100134U, // <1,4,4,0>: Cost 3 vext1 <1,1,4,4>, LHS
+ 2557100882U, // <1,4,4,1>: Cost 3 vext1 <1,1,4,4>, <1,1,4,4>
+ 3165708289U, // <1,4,4,2>: Cost 3 ins <1,u,4,2>, lane 1
+ 2819416409U, // <1,4,4,3>: Cost 3 vuzpr <0,1,2,4>, <0,4,2,3>
+ 2131132416U, // <1,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2758438198U, // <1,4,4,6>: Cost 3 vuzpl <1,1,4,5>, RHS
+ 2819419365U, // <1,4,4,7>: Cost 3 vuzpr <0,1,2,4>, <4,4,6,7>
+ 2131132416U, // <1,4,4,u>: Cost 2 ins <u,4,4,4>, lane 0
+ 1477394554U, // <1,4,5,0>: Cost 2 vext1 <0,1,4,5>, <0,1,4,5>
+ 2955558949U, // <1,4,5,1>: Cost 3 vzipr <0,4,1,5>, <0,0,4,1>
+ 3204931584U, // <1,4,5,2>: Cost 3 ins <u,4,5,2>, lane 0
+ 3165790209U, // <1,4,5,3>: Cost 3 ins <1,u,5,3>, lane 1
+ 1477397814U, // <1,4,5,4>: Cost 2 vext1 <0,1,4,5>, RHS
+ 1821510966U, // <1,4,5,5>: Cost 2 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2092081153U, // <1,4,5,7>: Cost 2 ins <1,u,5,7>, lane 1
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117268U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,4,6>
+ 3165855745U, // <1,4,6,2>: Cost 3 ins <1,u,6,2>, lane 1
+ 2569062662U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,1,4,6>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 2895768886U, // <1,4,6,5>: Cost 3 vzipl <1,6,1,7>, RHS
+ 2131296256U, // <1,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2131304448U, // <1,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0
+ 2131296256U, // <1,4,6,u>: Cost 2 ins <u,4,6,6>, lane 0
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3165921281U, // <1,4,7,1>: Cost 3 ins <1,u,7,1>, lane 1
+ 3205079040U, // <1,4,7,2>: Cost 3 ins <u,4,7,2>, lane 0
+ 2861297792U, // <1,4,7,3>: Cost 3 vuzpr <7,1,3,4>, <5,7,1,3>
+ 2669778278U, // <1,4,7,4>: Cost 3 vext2 <u,7,1,4>, <7,4,5,6>
+ 3205103616U, // <1,4,7,5>: Cost 3 ins <u,4,7,5>, lane 0
+ 2131369984U, // <1,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 3165970433U, // <1,4,7,7>: Cost 3 ins <1,u,7,7>, lane 1
+ 2131369984U, // <1,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0
+ 2091876353U, // <1,4,u,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 2091737089U, // <1,4,u,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 2091892737U, // <1,4,u,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2091753473U, // <1,4,u,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 2061921488U, // <1,4,u,4>: Cost 2 vtrnr LHS, <4,4,4,4>
+ 2014142802U, // <1,4,u,5>: Cost 2 vtrnr LHS, <0,4,1,5>
+ 2014142812U, // <1,4,u,6>: Cost 2 vtrnr LHS, <0,4,2,6>
+ 2091933697U, // <1,4,u,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 2014142805U, // <1,4,u,u>: Cost 2 vtrnr LHS, <0,4,1,u>
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 1686110310U, // <1,5,0,2>: Cost 2 vuzpl <1,3,5,7>, LHS
+ 3163471875U, // <1,5,0,3>: Cost 3 ins <1,5,0,u>, lane 3
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 3165437953U, // <1,5,0,5>: Cost 3 ins <1,u,0,5>, lane 1
+ 3164045314U, // <1,5,0,6>: Cost 3 ins <1,5,u,6>, lane 2
+ 2090311682U, // <1,5,0,7>: Cost 2 ins <1,5,u,7>, lane 2
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2091737089U, // <1,5,1,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2091753473U, // <1,5,1,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 1686111232U, // <1,5,1,5>: Cost 2 vuzpl <1,3,5,7>, <1,3,5,7>
+ 2958181456U, // <1,5,1,6>: Cost 3 vzipr <0,u,1,1>, <1,4,5,6>
+ 2019986742U, // <1,5,1,7>: Cost 2 vtrnr <1,1,1,1>, RHS
+ 2019986743U, // <1,5,1,u>: Cost 2 vtrnr <1,1,1,1>, RHS
+ 2759853734U, // <1,5,2,0>: Cost 3 vuzpl <1,3,5,7>, <2,3,0,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2090319877U, // <1,5,2,3>: Cost 2 ins <1,5,u,u>, lane 5
+ 2759853774U, // <1,5,2,4>: Cost 3 vuzpl <1,3,5,7>, <2,3,4,5>
+ 2994687194U, // <1,5,2,5>: Cost 3 vzipr <7,0,1,2>, <4,4,5,5>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 2090311682U, // <1,5,2,7>: Cost 2 ins <1,5,u,7>, lane 2
+ 2090319877U, // <1,5,2,u>: Cost 2 ins <1,5,u,u>, lane 5
+ 2091876353U, // <1,5,3,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 2089951235U, // <1,5,3,1>: Cost 2 ins <1,5,3,u>, lane 3
+ 2091892737U, // <1,5,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2091900929U, // <1,5,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2091909121U, // <1,5,3,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 2061881348U, // <1,5,3,5>: Cost 2 vtrnr LHS, <5,5,5,5>
+ 2089951235U, // <1,5,3,6>: Cost 2 ins <1,5,3,u>, lane 3
+ 940363062U, // <1,5,3,7>: Cost 1 vtrnr LHS, RHS
+ 940363063U, // <1,5,3,u>: Cost 1 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3164012546U, // <1,5,4,2>: Cost 3 ins <1,5,u,2>, lane 2
+ 3163766787U, // <1,5,4,3>: Cost 3 ins <1,5,4,u>, lane 3
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 1686113590U, // <1,5,4,6>: Cost 2 vuzpl <1,3,5,7>, RHS
+ 2090311682U, // <1,5,4,7>: Cost 2 ins <1,5,u,7>, lane 2
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2955561954U, // <1,5,5,0>: Cost 3 vzipr <0,4,1,5>, <4,1,5,0>
+ 2955561874U, // <1,5,5,1>: Cost 3 vzipr <0,4,1,5>, <4,0,5,1>
+ 3165782017U, // <1,5,5,2>: Cost 3 ins <1,u,5,2>, lane 1
+ 2955559851U, // <1,5,5,3>: Cost 3 vzipr <0,4,1,5>, <1,2,5,3>
+ 2955561958U, // <1,5,5,4>: Cost 3 vzipr <0,4,1,5>, <4,1,5,4>
+ 2131877888U, // <1,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0
+ 2955561474U, // <1,5,5,6>: Cost 3 vzipr <0,4,1,5>, <3,4,5,6>
+ 2092081153U, // <1,5,5,7>: Cost 2 ins <1,u,5,7>, lane 1
+ 2092081153U, // <1,5,5,u>: Cost 2 ins <1,u,5,7>, lane 1
+ 2131910656U, // <1,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <1,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <1,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <1,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <1,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <1,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <1,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <1,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <1,5,6,u>: Cost 1 ins RHS, lane 0
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199198U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,5,7>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2759857248U, // <1,5,7,3>: Cost 3 vuzpl <1,3,5,7>, <7,1,3,5>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2759857510U, // <1,5,7,5>: Cost 3 vuzpl <1,3,5,7>, <7,4,5,6>
+ 2593035086U, // <1,5,7,6>: Cost 3 vext1 <7,1,5,7>, <6,7,0,1>
+ 2132041728U, // <1,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0
+ 2132041728U, // <1,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0
+ 2091876353U, // <1,5,u,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 1686116142U, // <1,5,u,2>: Cost 2 vuzpl <1,3,5,7>, LHS
+ 2091753473U, // <1,5,u,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 1686116506U, // <1,5,u,6>: Cost 2 vuzpl <1,3,5,7>, RHS
+ 940404022U, // <1,5,u,7>: Cost 1 vtrnr LHS, RHS
+ 940404023U, // <1,5,u,u>: Cost 1 vtrnr LHS, RHS
+ 3205873664U, // <1,6,0,0>: Cost 3 ins <u,6,0,0>, lane 0
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2132148224U, // <1,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 3087819259U, // <1,6,0,3>: Cost 3 vtrnr <0,1,2,0>, <0,6,2,3>
+ 2620023123U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,6>
+ 3165437953U, // <1,6,0,5>: Cost 3 ins <1,u,0,5>, lane 1
+ 3164708866U, // <1,6,0,6>: Cost 3 ins <1,6,u,6>, lane 2
+ 2954857782U, // <1,6,0,7>: Cost 3 vzipr <0,3,1,0>, RHS
+ 2132148224U, // <1,6,0,u>: Cost 2 ins <u,6,0,2>, lane 0
+ 3205947392U, // <1,6,1,0>: Cost 3 ins <u,6,1,0>, lane 0
+ 2091737089U, // <1,6,1,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 3005959068U, // <1,6,1,2>: Cost 3 vzipr <u,u,1,1>, <4,0,6,2>
+ 2091753473U, // <1,6,1,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3205988352U, // <1,6,1,5>: Cost 3 ins <u,6,1,5>, lane 0
+ 1745690729U, // <1,6,1,6>: Cost 2 vuzpr <0,1,2,6>, <0,1,2,6>
+ 1884441910U, // <1,6,1,7>: Cost 2 vzipr <0,u,1,1>, RHS
+ 1884441911U, // <1,6,1,u>: Cost 2 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 2994687442U, // <1,6,2,1>: Cost 3 vzipr <7,0,1,2>, <4,7,6,1>
+ 2994686876U, // <1,6,2,2>: Cost 3 vzipr <7,0,1,2>, <4,0,6,2>
+ 2132303872U, // <1,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0
+ 3206053888U, // <1,6,2,4>: Cost 3 ins <u,6,2,4>, lane 0
+ 3165585409U, // <1,6,2,5>: Cost 3 ins <1,u,2,5>, lane 1
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 1897057590U, // <1,6,2,7>: Cost 2 vzipr <3,0,1,2>, RHS
+ 1897057591U, // <1,6,2,u>: Cost 2 vzipr <3,0,1,2>, RHS
+ 2061881442U, // <1,6,3,0>: Cost 2 vtrnr LHS, <5,6,7,0>
+ 2987396400U, // <1,6,3,1>: Cost 3 vzipr <5,7,1,3>, <4,5,6,1>
+ 2061880652U, // <1,6,3,2>: Cost 2 vtrnr LHS, <4,6,0,2>
+ 2091900929U, // <1,6,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2061881446U, // <1,6,3,4>: Cost 2 vtrnr LHS, <5,6,7,4>
+ 3118078194U, // <1,6,3,5>: Cost 3 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2061880692U, // <1,6,3,6>: Cost 2 vtrnr LHS, <4,6,4,6>
+ 2014103482U, // <1,6,3,7>: Cost 2 vtrnr LHS, <2,6,3,7>
+ 2014103483U, // <1,6,3,u>: Cost 2 vtrnr LHS, <2,6,3,u>
+ 3206168576U, // <1,6,4,0>: Cost 3 ins <u,6,4,0>, lane 0
+ 2761256201U, // <1,6,4,1>: Cost 3 vuzpl <1,5,6,7>, <4,5,1,7>
+ 3164676098U, // <1,6,4,2>: Cost 3 ins <1,6,u,2>, lane 2
+ 3087852027U, // <1,6,4,3>: Cost 3 vtrnr <0,1,2,4>, <0,6,2,3>
+ 3206201344U, // <1,6,4,4>: Cost 3 ins <u,6,4,4>, lane 0
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2132475904U, // <1,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 2954890550U, // <1,6,4,7>: Cost 3 vzipr <0,3,1,4>, RHS
+ 2132475904U, // <1,6,4,u>: Cost 2 ins <u,6,4,6>, lane 0
+ 3164659714U, // <1,6,5,0>: Cost 3 ins <1,6,u,0>, lane 2
+ 3206250496U, // <1,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0
+ 3003337628U, // <1,6,5,2>: Cost 3 vzipr <u,4,1,5>, <4,0,6,2>
+ 3165790209U, // <1,6,5,3>: Cost 3 ins <1,u,5,3>, lane 1
+ 3206275072U, // <1,6,5,4>: Cost 3 ins <u,6,5,4>, lane 0
+ 3206283264U, // <1,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0
+ 3003337956U, // <1,6,5,6>: Cost 3 vzipr <u,4,1,5>, <4,4,6,6>
+ 1881820470U, // <1,6,5,7>: Cost 2 vzipr <0,4,1,5>, RHS
+ 1881820471U, // <1,6,5,u>: Cost 2 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 2557264742U, // <1,6,6,1>: Cost 3 vext1 <1,1,6,6>, <1,1,6,6>
+ 3165855745U, // <1,6,6,2>: Cost 3 ins <1,u,6,2>, lane 1
+ 2819432955U, // <1,6,6,3>: Cost 3 vuzpr <0,1,2,6>, <0,6,2,3>
+ 3206348800U, // <1,6,6,4>: Cost 3 ins <u,6,6,4>, lane 0
+ 3206356992U, // <1,6,6,5>: Cost 3 ins <u,6,6,5>, lane 0
+ 2132623360U, // <1,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0
+ 2132631552U, // <1,6,6,7>: Cost 2 ins <u,6,6,7>, lane 0
+ 2132623360U, // <1,6,6,u>: Cost 2 ins <u,6,6,6>, lane 0
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 3206406144U, // <1,6,7,2>: Cost 3 ins <u,6,7,2>, lane 0
+ 3206414336U, // <1,6,7,3>: Cost 3 ins <u,6,7,3>, lane 0
+ 2132680704U, // <1,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2725507979U, // <1,6,7,6>: Cost 3 vext3 <6,u,0,1>, <6,7,6,u>
+ 2132705280U, // <1,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2091737089U, // <1,6,u,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 2061921612U, // <1,6,u,2>: Cost 2 vtrnr LHS, <4,6,0,2>
+ 2091753473U, // <1,6,u,3>: Cost 2 ins <1,u,1,3>, lane 1
+ 2061922406U, // <1,6,u,4>: Cost 2 vtrnr LHS, <5,6,7,4>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2061921652U, // <1,6,u,6>: Cost 2 vtrnr LHS, <4,6,4,6>
+ 2014144442U, // <1,6,u,7>: Cost 2 vtrnr LHS, <2,6,3,7>
+ 2014144443U, // <1,6,u,u>: Cost 2 vtrnr LHS, <2,6,3,u>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2132803584U, // <1,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0
+ 3206553600U, // <1,7,0,2>: Cost 3 ins <u,7,0,2>, lane 0
+ 2257286235U, // <1,7,0,3>: Cost 3 vrev <7,1,3,0>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3206578176U, // <1,7,0,5>: Cost 3 ins <u,7,0,5>, lane 0
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 3165380610U, // <1,7,0,7>: Cost 3 ins <1,7,u,7>, lane 2
+ 2132803584U, // <1,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0
+ 2581184614U, // <1,7,1,0>: Cost 3 vext1 <5,1,7,1>, LHS
+ 2091737089U, // <1,7,1,1>: Cost 2 ins <1,u,1,1>, lane 1
+ 3206627328U, // <1,7,1,2>: Cost 3 ins <u,7,1,2>, lane 0
+ 2132893696U, // <1,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0
+ 2581187894U, // <1,7,1,4>: Cost 3 vext1 <5,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 1745698922U, // <1,7,1,7>: Cost 2 vuzpr <0,1,2,7>, <0,1,2,7>
+ 2132893696U, // <1,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 2994687370U, // <1,7,2,1>: Cost 3 vzipr <7,0,1,2>, <4,6,7,1>
+ 3206701056U, // <1,7,2,2>: Cost 3 ins <u,7,2,2>, lane 0
+ 2132967424U, // <1,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3206725632U, // <1,7,2,5>: Cost 3 ins <u,7,2,5>, lane 0
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 2994688024U, // <1,7,2,7>: Cost 3 vzipr <7,0,1,2>, <5,5,7,7>
+ 2132967424U, // <1,7,2,u>: Cost 2 ins <u,7,2,3>, lane 0
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2061882190U, // <1,7,3,1>: Cost 2 vtrnr LHS, <6,7,0,1>
+ 2091892737U, // <1,7,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2061881472U, // <1,7,3,3>: Cost 2 vtrnr LHS, <5,7,1,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2061881512U, // <1,7,3,7>: Cost 2 vtrnr LHS, <5,7,5,7>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3165331458U, // <1,7,4,1>: Cost 3 ins <1,7,u,1>, lane 2
+ 2644585539U, // <1,7,4,2>: Cost 3 vext2 <4,5,1,7>, <4,2,6,7>
+ 2257319007U, // <1,7,4,3>: Cost 3 vrev <7,1,3,4>
+ 3206864896U, // <1,7,4,4>: Cost 3 ins <u,7,4,4>, lane 0
+ 2133131264U, // <1,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0
+ 3206881280U, // <1,7,4,6>: Cost 3 ins <u,7,4,6>, lane 0
+ 3165380610U, // <1,7,4,7>: Cost 3 ins <1,7,u,7>, lane 2
+ 2133131264U, // <1,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 3028292602U, // <1,7,5,1>: Cost 3 vtrnl <1,3,5,7>, <7,0,1,2>
+ 3165782017U, // <1,7,5,2>: Cost 3 ins <1,u,5,2>, lane 1
+ 3028292704U, // <1,7,5,3>: Cost 3 vtrnl <1,3,5,7>, <7,1,3,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 3028292966U, // <1,7,5,5>: Cost 3 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 2133221376U, // <1,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0
+ 2133221376U, // <1,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3206995968U, // <1,7,6,2>: Cost 3 ins <u,7,6,2>, lane 0
+ 3165347842U, // <1,7,6,3>: Cost 3 ins <1,7,u,3>, lane 2
+ 2257409130U, // <1,7,6,4>: Cost 3 vrev <7,1,4,6>
+ 3207020544U, // <1,7,6,5>: Cost 3 ins <u,7,6,5>, lane 0
+ 3207028736U, // <1,7,6,6>: Cost 3 ins <u,7,6,6>, lane 0
+ 2133295104U, // <1,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0
+ 2133295104U, // <1,7,6,u>: Cost 2 ins <u,7,6,7>, lane 0
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 2861470542U, // <1,7,7,1>: Cost 3 vuzpr <7,1,5,7>, <6,7,0,1>
+ 3165929473U, // <1,7,7,2>: Cost 3 ins <1,u,7,2>, lane 1
+ 2998046416U, // <1,7,7,3>: Cost 3 vzipr <7,5,1,7>, <5,1,7,3>
+ 3207086080U, // <1,7,7,4>: Cost 3 ins <u,7,7,4>, lane 0
+ 2257491060U, // <1,7,7,5>: Cost 3 vrev <7,1,5,7>
+ 3207102464U, // <1,7,7,6>: Cost 3 ins <u,7,7,6>, lane 0
+ 2133368832U, // <1,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0
+ 2133368832U, // <1,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2061923150U, // <1,7,u,1>: Cost 2 vtrnr LHS, <6,7,0,1>
+ 2091892737U, // <1,7,u,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2061922432U, // <1,7,u,3>: Cost 2 vtrnr LHS, <5,7,1,3>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2061922472U, // <1,7,u,7>: Cost 2 vtrnr LHS, <5,7,5,7>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1745707008U, // <1,u,0,0>: Cost 2 vuzpr LHS, <0,0,0,0>
+ 1745707018U, // <1,u,0,1>: Cost 2 vuzpr LHS, <0,0,1,1>
+ 1745707028U, // <1,u,0,2>: Cost 2 vuzpr LHS, <0,0,2,2>
+ 2087624706U, // <1,u,0,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1818155162U, // <1,u,0,5>: Cost 2 vzipl <1,0,3,2>, RHS
+ 2891897040U, // <1,u,0,6>: Cost 3 vzipl <1,0,3,2>, <u,6,3,7>
+ 2088984578U, // <1,u,0,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1745707025U, // <1,u,0,u>: Cost 2 vuzpr LHS, <0,0,1,u>
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 671965286U, // <1,u,1,3>: Cost 1 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1818663066U, // <1,u,1,5>: Cost 2 vzipl <1,1,1,1>, RHS
+ 1952880794U, // <1,u,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS
+ 1884441928U, // <1,u,1,7>: Cost 2 vzipr <0,u,1,1>, RHS
+ 671965291U, // <1,u,1,u>: Cost 1 vuzpr LHS, LHS
+ 1745707926U, // <1,u,2,0>: Cost 2 vuzpr LHS, <1,2,3,0>
+ 1819465518U, // <1,u,2,1>: Cost 2 vzipl <1,2,3,0>, LHS
+ 1745707172U, // <1,u,2,2>: Cost 2 vuzpr LHS, <0,2,0,2>
+ 1055244288U, // <1,u,2,3>: Cost 1 ins LHS, lane 0
+ 1745707930U, // <1,u,2,4>: Cost 2 vuzpr LHS, <1,2,3,4>
+ 1819465882U, // <1,u,2,5>: Cost 2 vzipl <1,2,3,0>, RHS
+ 1745707212U, // <1,u,2,6>: Cost 2 vuzpr LHS, <0,2,4,6>
+ 1897057608U, // <1,u,2,7>: Cost 2 vzipr <3,0,1,2>, RHS
+ 1055244288U, // <1,u,2,u>: Cost 1 ins LHS, lane 0
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 2014102162U, // <1,u,3,1>: Cost 2 vtrnr LHS, <0,u,1,1>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 940360349U, // <1,u,3,3>: Cost 1 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 2014102166U, // <1,u,3,5>: Cost 2 vtrnr LHS, <0,u,1,5>
+ 2014102176U, // <1,u,3,6>: Cost 2 vtrnr LHS, <0,u,2,6>
+ 940363305U, // <1,u,3,7>: Cost 1 vtrnr LHS, RHS
+ 940360354U, // <1,u,3,u>: Cost 1 vtrnr LHS, LHS
+ 2088263682U, // <1,u,4,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 2087608322U, // <1,u,4,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 2086952962U, // <1,u,4,2>: Cost 2 ins <1,0,u,2>, lane 2
+ 2087624706U, // <1,u,4,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 1793486032U, // <1,u,4,4>: Cost 2 vuzpr LHS, <4,4,4,4>
+ 1745707346U, // <1,u,4,5>: Cost 2 vuzpr LHS, <0,4,1,5>
+ 1745707356U, // <1,u,4,6>: Cost 2 vuzpr LHS, <0,4,2,6>
+ 2088984578U, // <1,u,4,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1745707349U, // <1,u,4,u>: Cost 2 vuzpr LHS, <0,4,1,u>
+ 2088263682U, // <1,u,5,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 1821513518U, // <1,u,5,1>: Cost 2 vzipl <1,5,3,7>, LHS
+ 1954551598U, // <1,u,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS
+ 1881817244U, // <1,u,5,3>: Cost 2 vzipr <0,4,1,5>, LHS
+ 2088296450U, // <1,u,5,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 1821513882U, // <1,u,5,5>: Cost 2 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 671968566U, // <1,u,5,7>: Cost 1 vuzpr LHS, RHS
+ 671968567U, // <1,u,5,u>: Cost 1 vuzpr LHS, RHS
+ 1793486946U, // <1,u,6,0>: Cost 2 vuzpr LHS, <5,6,7,0>
+ 2087608322U, // <1,u,6,1>: Cost 2 ins <1,1,u,1>, lane 2
+ 1793486156U, // <1,u,6,2>: Cost 2 vuzpr LHS, <4,6,0,2>
+ 2087624706U, // <1,u,6,3>: Cost 2 ins <1,1,u,3>, lane 2
+ 1793486950U, // <1,u,6,4>: Cost 2 vuzpr LHS, <5,6,7,4>
+ 2131951616U, // <1,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 1793486196U, // <1,u,6,6>: Cost 2 vuzpr LHS, <4,6,4,6>
+ 1058226176U, // <1,u,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <1,u,6,u>: Cost 1 ins RHS, lane 0
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 1793487694U, // <1,u,7,1>: Cost 2 vuzpr LHS, <6,7,0,1>
+ 2086952962U, // <1,u,7,2>: Cost 2 ins <1,0,u,2>, lane 2
+ 1793486976U, // <1,u,7,3>: Cost 2 vuzpr LHS, <5,7,1,3>
+ 2088296450U, // <1,u,7,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 1793487734U, // <1,u,7,5>: Cost 2 vuzpr LHS, <6,7,4,5>
+ 2131369984U, // <1,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 1793487016U, // <1,u,7,7>: Cost 2 vuzpr LHS, <5,7,5,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 671965853U, // <1,u,u,3>: Cost 1 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1745707670U, // <1,u,u,5>: Cost 2 vuzpr LHS, <0,u,1,5>
+ 1745707680U, // <1,u,u,6>: Cost 2 vuzpr LHS, <0,u,2,6>
+ 671968809U, // <1,u,u,7>: Cost 1 vuzpr LHS, RHS
+ 671965858U, // <1,u,u,u>: Cost 1 vuzpr LHS, LHS
+ 2128150528U, // <2,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0
+ 2097635329U, // <2,0,0,1>: Cost 2 ins <2,u,0,1>, lane 1
+ 1691664486U, // <2,0,0,2>: Cost 2 vuzpl <2,3,0,1>, LHS
+ 2826094014U, // <2,0,0,3>: Cost 3 vuzpr <1,2,3,0>, <2,0,1,3>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 2826094772U, // <2,0,0,5>: Cost 3 vuzpr <1,2,3,0>, <3,0,4,5>
+ 3171418113U, // <2,0,0,6>: Cost 3 ins <2,u,0,6>, lane 1
+ 3094529510U, // <2,0,0,7>: Cost 3 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1691664540U, // <2,0,0,u>: Cost 2 vuzpl <2,3,0,1>, LHS
+ 2215927971U, // <2,0,1,0>: Cost 3 vrev <0,2,0,1>
+ 2128232448U, // <2,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1752350822U, // <2,0,1,3>: Cost 2 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 2765407232U, // <2,0,1,5>: Cost 3 vuzpl <2,3,0,1>, <1,3,5,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3166707714U, // <2,0,1,7>: Cost 3 ins <2,0,u,7>, lane 2
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1142194340U, // <2,0,2,0>: Cost 2 vrev <0,2,0,2>
+ 1825374310U, // <2,0,2,1>: Cost 2 vzipl <2,2,2,2>, LHS
+ 1959592038U, // <2,0,2,2>: Cost 2 vtrnl <2,2,2,2>, LHS
+ 2128322560U, // <2,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2599259856U, // <2,0,2,5>: Cost 3 vext1 <u,2,0,2>, <5,1,7,3>
+ 3088351274U, // <2,0,2,6>: Cost 3 vtrnr <0,2,0,2>, <0,0,4,6>
+ 2599261178U, // <2,0,2,7>: Cost 3 vext1 <u,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 1879883776U, // <2,0,3,0>: Cost 2 vzipr LHS, <0,0,0,0>
+ 1879885478U, // <2,0,3,1>: Cost 2 vzipr LHS, <2,3,0,1>
+ 1879883940U, // <2,0,3,2>: Cost 2 vzipr LHS, <0,2,0,2>
+ 2097872897U, // <2,0,3,3>: Cost 2 ins <2,u,3,3>, lane 1
+ 2958270630U, // <2,0,3,4>: Cost 3 vzipr LHS, <0,2,0,4>
+ 2826094286U, // <2,0,3,5>: Cost 3 vuzpr <1,2,3,0>, <2,3,4,5>
+ 2958270794U, // <2,0,3,6>: Cost 3 vzipr LHS, <0,4,0,6>
+ 2097905665U, // <2,0,3,7>: Cost 2 ins <2,u,3,7>, lane 1
+ 1879883946U, // <2,0,3,u>: Cost 2 vzipr LHS, <0,2,0,u>
+ 2215952550U, // <2,0,4,0>: Cost 3 vrev <0,2,0,4>
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 1960427622U, // <2,0,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS
+ 3171688449U, // <2,0,4,3>: Cost 3 ins <2,u,4,3>, lane 1
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2097963009U, // <2,0,4,5>: Cost 2 ins <2,u,4,5>, lane 1
+ 1691667766U, // <2,0,4,6>: Cost 2 vuzpl <2,3,0,1>, RHS
+ 3171721217U, // <2,0,4,7>: Cost 3 ins <2,u,4,7>, lane 1
+ 1691667784U, // <2,0,4,u>: Cost 2 vuzpl <2,3,0,1>, RHS
+ 3033596068U, // <2,0,5,0>: Cost 3 vtrnl <2,2,5,7>, <0,2,0,2>
+ 2128527360U, // <2,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 2955632804U, // <2,0,5,2>: Cost 3 vzipr <0,4,2,5>, <0,2,0,2>
+ 2216181954U, // <2,0,5,3>: Cost 3 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 2867900420U, // <2,0,5,5>: Cost 3 vuzpr <u,2,3,0>, <5,5,5,5>
+ 3202310144U, // <2,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0
+ 1752354102U, // <2,0,5,7>: Cost 2 vuzpr <1,2,3,0>, RHS
+ 1752354103U, // <2,0,5,u>: Cost 2 vuzpr <1,2,3,0>, RHS
+ 3088678912U, // <2,0,6,0>: Cost 3 vtrnr <0,2,4,6>, <0,0,0,0>
+ 1828143206U, // <2,0,6,1>: Cost 2 vzipl <2,6,3,7>, LHS
+ 2128609280U, // <2,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 3171835905U, // <2,0,6,3>: Cost 3 ins <2,u,6,3>, lane 1
+ 1142522060U, // <2,0,6,4>: Cost 2 vrev <0,2,4,6>
+ 3171852289U, // <2,0,6,5>: Cost 3 ins <2,u,6,5>, lane 1
+ 2867899764U, // <2,0,6,6>: Cost 3 vuzpr <u,2,3,0>, <4,6,4,6>
+ 2128650240U, // <2,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0
+ 1142817008U, // <2,0,6,u>: Cost 2 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 2867901262U, // <2,0,7,1>: Cost 3 vuzpr <u,2,3,0>, <6,7,0,1>
+ 2956976292U, // <2,0,7,2>: Cost 3 vzipr <0,6,2,7>, <0,2,0,2>
+ 2867900544U, // <2,0,7,3>: Cost 3 vuzpr <u,2,3,0>, <5,7,1,3>
+ 3171917825U, // <2,0,7,4>: Cost 3 ins <2,u,7,4>, lane 1
+ 2867901302U, // <2,0,7,5>: Cost 3 vuzpr <u,2,3,0>, <6,7,4,5>
+ 3166699522U, // <2,0,7,6>: Cost 3 ins <2,0,u,6>, lane 2
+ 2867900584U, // <2,0,7,7>: Cost 3 vuzpr <u,2,3,0>, <5,7,5,7>
+ 2867900549U, // <2,0,7,u>: Cost 3 vuzpr <u,2,3,0>, <5,7,1,u>
+ 1879924736U, // <2,0,u,0>: Cost 2 vzipr LHS, <0,0,0,0>
+ 1879926438U, // <2,0,u,1>: Cost 2 vzipr LHS, <2,3,0,1>
+ 1879924900U, // <2,0,u,2>: Cost 2 vzipr LHS, <0,2,0,2>
+ 1752351389U, // <2,0,u,3>: Cost 2 vuzpr <1,2,3,0>, LHS
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2097963009U, // <2,0,u,5>: Cost 2 ins <2,u,4,5>, lane 1
+ 1691670682U, // <2,0,u,6>: Cost 2 vuzpl <2,3,0,1>, RHS
+ 1752354345U, // <2,0,u,7>: Cost 2 vuzpr <1,2,3,0>, RHS
+ 1879924906U, // <2,0,u,u>: Cost 2 vzipr LHS, <0,2,0,u>
+ 2763497636U, // <2,1,0,0>: Cost 3 vuzpl <2,0,1,2>, <0,2,0,2>
+ 2097635329U, // <2,1,0,1>: Cost 2 ins <2,u,0,1>, lane 1
+ 2820130966U, // <2,1,0,2>: Cost 3 vuzpr <0,2,3,1>, <3,0,1,2>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2767487180U, // <2,1,0,4>: Cost 3 vuzpl <2,6,1,3>, <0,2,4,6>
+ 3033842688U, // <2,1,0,5>: Cost 3 vtrnl <2,3,0,1>, <1,3,5,7>
+ 3171418113U, // <2,1,0,6>: Cost 3 ins <2,u,0,6>, lane 1
+ 3171426305U, // <2,1,0,7>: Cost 3 ins <2,u,0,7>, lane 1
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551546028U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, <0,2,1,1>
+ 2128896000U, // <2,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0
+ 2954938518U, // <2,1,1,2>: Cost 3 vzipr <0,3,2,1>, <3,0,1,2>
+ 2128912384U, // <2,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3202670592U, // <2,1,1,5>: Cost 3 ins <u,1,1,5>, lane 0
+ 3202678784U, // <2,1,1,6>: Cost 3 ins <u,1,1,6>, lane 0
+ 2953612553U, // <2,1,1,7>: Cost 3 vzipr <0,1,2,1>, <4,5,1,7>
+ 2128896000U, // <2,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0
+ 2128961536U, // <2,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <2,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <2,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <2,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <2,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <2,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <2,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <2,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <2,1,2,u>: Cost 1 ins LHS, lane 0
+ 2953625609U, // <2,1,3,0>: Cost 3 vzipr LHS, <0,0,1,0>
+ 1879883786U, // <2,1,3,1>: Cost 2 vzipr LHS, <0,0,1,1>
+ 1879885974U, // <2,1,3,2>: Cost 2 vzipr LHS, <3,0,1,2>
+ 1879884760U, // <2,1,3,3>: Cost 2 vzipr LHS, <1,3,1,3>
+ 2953625856U, // <2,1,3,4>: Cost 3 vzipr LHS, <0,3,1,4>
+ 1879884114U, // <2,1,3,5>: Cost 2 vzipr LHS, <0,4,1,5>
+ 2958270641U, // <2,1,3,6>: Cost 3 vzipr LHS, <0,2,1,6>
+ 2097905665U, // <2,1,3,7>: Cost 2 ins <2,u,3,7>, lane 1
+ 1879883793U, // <2,1,3,u>: Cost 2 vzipr LHS, <0,0,1,u>
+ 3171663873U, // <2,1,4,0>: Cost 3 ins <2,u,4,0>, lane 1
+ 3094561588U, // <2,1,4,1>: Cost 3 vtrnr <1,2,3,4>, <1,1,1,1>
+ 2900378522U, // <2,1,4,2>: Cost 3 vzipl <2,4,1,3>, <1,2,3,4>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3171696641U, // <2,1,4,4>: Cost 3 ins <2,u,4,4>, lane 1
+ 2097963009U, // <2,1,4,5>: Cost 2 ins <2,u,4,5>, lane 1
+ 2763500854U, // <2,1,4,6>: Cost 3 vuzpl <2,0,1,2>, RHS
+ 3171721217U, // <2,1,4,7>: Cost 3 ins <2,u,4,7>, lane 1
+ 2020819051U, // <2,1,4,u>: Cost 2 vtrnr <1,2,3,4>, LHS
+ 2551578800U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, <0,2,1,5>
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 2901001110U, // <2,1,5,2>: Cost 3 vzipl <2,5,0,7>, <1,2,3,0>
+ 2129207296U, // <2,1,5,3>: Cost 2 ins <u,1,5,3>, lane 0
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3202965504U, // <2,1,5,5>: Cost 3 ins <u,1,5,5>, lane 0
+ 3171786753U, // <2,1,5,6>: Cost 3 ins <2,u,5,6>, lane 1
+ 2819910966U, // <2,1,5,7>: Cost 3 vuzpr <0,2,0,1>, RHS
+ 2129207296U, // <2,1,5,u>: Cost 2 ins <u,1,5,3>, lane 0
+ 2551586993U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, <0,2,1,6>
+ 3088679732U, // <2,1,6,1>: Cost 3 vtrnr <0,2,4,6>, <1,1,1,1>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 2014937190U, // <2,1,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 2955641170U, // <2,1,6,5>: Cost 3 vzipr <0,4,2,6>, <0,4,1,5>
+ 2901886177U, // <2,1,6,6>: Cost 3 vzipl <2,6,3,7>, <1,6,3,7>
+ 2129313792U, // <2,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0
+ 2014937195U, // <2,1,6,u>: Cost 2 vtrnr <0,2,4,6>, LHS
+ 3171885057U, // <2,1,7,0>: Cost 3 ins <2,u,7,0>, lane 1
+ 3203080192U, // <2,1,7,1>: Cost 3 ins <u,1,7,1>, lane 0
+ 3001439874U, // <2,1,7,2>: Cost 3 vzipr <u,1,2,7>, <7,u,1,2>
+ 2129354752U, // <2,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0
+ 3171917825U, // <2,1,7,4>: Cost 3 ins <2,u,7,4>, lane 1
+ 3203112960U, // <2,1,7,5>: Cost 3 ins <u,1,7,5>, lane 0
+ 2222392248U, // <2,1,7,6>: Cost 3 vrev <1,2,6,7>
+ 3171942401U, // <2,1,7,7>: Cost 3 ins <2,u,7,7>, lane 1
+ 2129354752U, // <2,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0
+ 2128961536U, // <2,1,u,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 1879924746U, // <2,1,u,1>: Cost 2 vzipr LHS, <0,0,1,1>
+ 1879926934U, // <2,1,u,2>: Cost 2 vzipr LHS, <3,0,1,2>
+ 1055244288U, // <2,1,u,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <2,1,u,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 1879925074U, // <2,1,u,5>: Cost 2 vzipr LHS, <0,4,1,5>
+ 2129010688U, // <2,1,u,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2097905665U, // <2,1,u,7>: Cost 2 ins <2,u,3,7>, lane 1
+ 1055244288U, // <2,1,u,u>: Cost 1 ins LHS, lane 0
+ 2020787094U, // <2,2,0,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 1691156582U, // <2,2,0,2>: Cost 2 vuzpl <2,2,2,2>, LHS
+ 2094260226U, // <2,2,0,3>: Cost 2 ins <2,2,u,3>, lane 2
+ 2819917256U, // <2,2,0,4>: Cost 3 vuzpr <0,2,0,2>, <2,0,2,4>
+ 3168018434U, // <2,2,0,5>: Cost 3 ins <2,2,u,5>, lane 2
+ 2819915818U, // <2,2,0,6>: Cost 3 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3171426305U, // <2,2,0,7>: Cost 3 ins <2,u,0,7>, lane 1
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 1879867492U, // <2,2,1,1>: Cost 2 vzipr <0,1,2,1>, <0,1,2,1>
+ 2094252034U, // <2,2,1,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 1746174054U, // <2,2,1,3>: Cost 2 vuzpr <0,2,0,2>, LHS
+ 3167526915U, // <2,2,1,4>: Cost 3 ins <2,2,1,u>, lane 3
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3203342336U, // <2,2,1,6>: Cost 3 ins <u,2,1,6>, lane 0
+ 3168034818U, // <2,2,1,7>: Cost 3 ins <2,2,u,7>, lane 2
+ 1746174059U, // <2,2,1,u>: Cost 2 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2093858819U, // <2,2,2,1>: Cost 2 ins <2,2,2,u>, lane 3
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 1884520550U, // <2,2,2,3>: Cost 2 vzipr <0,u,2,2>, LHS
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2093858819U, // <2,2,2,5>: Cost 2 ins <2,2,2,u>, lane 3
+ 2093858819U, // <2,2,2,6>: Cost 2 ins <2,2,2,u>, lane 3
+ 2093858819U, // <2,2,2,7>: Cost 2 ins <2,2,2,u>, lane 3
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2129698816U, // <2,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 2093932547U, // <2,2,3,1>: Cost 2 ins <2,2,3,u>, lane 3
+ 1879885416U, // <2,2,3,2>: Cost 2 vzipr LHS, <2,2,2,2>
+ 806142054U, // <2,2,3,3>: Cost 1 vzipr LHS, LHS
+ 2129731584U, // <2,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 2093932547U, // <2,2,3,5>: Cost 2 ins <2,2,3,u>, lane 3
+ 1884528988U, // <2,2,3,6>: Cost 2 vzipr LHS, <0,4,2,6>
+ 2097905665U, // <2,2,3,7>: Cost 2 ins <2,u,3,7>, lane 1
+ 806142059U, // <2,2,3,u>: Cost 1 vzipr LHS, LHS
+ 2551644344U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, <0,2,2,4>
+ 3171672065U, // <2,2,4,1>: Cost 3 ins <2,u,4,1>, lane 1
+ 2094252034U, // <2,2,4,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 2094260226U, // <2,2,4,3>: Cost 2 ins <2,2,u,3>, lane 2
+ 2020819866U, // <2,2,4,4>: Cost 2 vtrnr <1,2,3,4>, <1,2,3,4>
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 1691159862U, // <2,2,4,6>: Cost 2 vuzpl <2,2,2,2>, RHS
+ 3171721217U, // <2,2,4,7>: Cost 3 ins <2,u,4,7>, lane 1
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3167821827U, // <2,2,5,0>: Cost 3 ins <2,2,5,u>, lane 3
+ 2670497488U, // <2,2,5,1>: Cost 3 vext2 <u,u,2,2>, <5,1,7,3>
+ 2094252034U, // <2,2,5,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 2094260226U, // <2,2,5,3>: Cost 2 ins <2,2,u,3>, lane 2
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 1879900264U, // <2,2,5,5>: Cost 2 vzipr <0,1,2,5>, <0,1,2,5>
+ 2670497890U, // <2,2,5,6>: Cost 3 vext2 <u,u,2,2>, <5,6,7,0>
+ 1746177334U, // <2,2,5,7>: Cost 2 vuzpr <0,2,0,2>, RHS
+ 1746177335U, // <2,2,5,u>: Cost 2 vuzpr <0,2,0,2>, RHS
+ 3088679830U, // <2,2,6,0>: Cost 3 vtrnr <0,2,4,6>, <1,2,3,0>
+ 3171819521U, // <2,2,6,1>: Cost 3 ins <2,u,6,1>, lane 1
+ 2094252034U, // <2,2,6,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 1881899110U, // <2,2,6,3>: Cost 2 vzipr <0,4,2,6>, LHS
+ 3088679078U, // <2,2,6,4>: Cost 3 vtrnr <0,2,4,6>, <0,2,0,4>
+ 3171852289U, // <2,2,6,5>: Cost 3 ins <2,u,6,5>, lane 1
+ 2014937292U, // <2,2,6,6>: Cost 2 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2094301189U, // <2,2,6,7>: Cost 2 ins <2,2,u,u>, lane 5
+ 1881899115U, // <2,2,6,u>: Cost 2 vzipr <0,4,2,6>, LHS
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 2867696462U, // <2,2,7,1>: Cost 3 vuzpr <u,2,0,2>, <6,7,0,1>
+ 2094252034U, // <2,2,7,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 2130018304U, // <2,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0
+ 2670499174U, // <2,2,7,4>: Cost 3 vext2 <u,u,2,2>, <7,4,5,6>
+ 2228291208U, // <2,2,7,5>: Cost 3 vrev <2,2,5,7>
+ 3203784704U, // <2,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0
+ 1879916650U, // <2,2,7,7>: Cost 2 vzipr <0,1,2,7>, <0,1,2,7>
+ 2130018304U, // <2,2,7,u>: Cost 2 ins <u,2,7,3>, lane 0
+ 2020787094U, // <2,2,u,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0>
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 806183014U, // <2,2,u,3>: Cost 1 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 1879925084U, // <2,2,u,6>: Cost 2 vzipr LHS, <0,4,2,6>
+ 1746177577U, // <2,2,u,7>: Cost 2 vuzpr <0,2,0,2>, RHS
+ 806183019U, // <2,2,u,u>: Cost 1 vzipr LHS, LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2094374915U, // <2,3,0,3>: Cost 2 ins <2,3,0,u>, lane 3
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2094940162U, // <2,3,0,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 2094374915U, // <2,3,0,6>: Cost 2 ins <2,3,0,u>, lane 3
+ 2094374915U, // <2,3,0,7>: Cost 2 ins <2,3,0,u>, lane 3
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2094956546U, // <2,3,1,7>: Cost 2 ins <2,3,u,7>, lane 2
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2094522371U, // <2,3,2,0>: Cost 2 ins <2,3,2,u>, lane 3
+ 2094907394U, // <2,3,2,1>: Cost 2 ins <2,3,u,1>, lane 2
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1059889156U, // <2,3,2,3>: Cost 1 ins LHS, lane 4
+ 2094522371U, // <2,3,2,4>: Cost 2 ins <2,3,2,u>, lane 3
+ 2094940162U, // <2,3,2,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2094956546U, // <2,3,2,7>: Cost 2 ins <2,3,u,7>, lane 2
+ 1059889156U, // <2,3,2,u>: Cost 1 ins LHS, lane 4
+ 1879884694U, // <2,3,3,0>: Cost 2 vzipr LHS, <1,2,3,0>
+ 2094907394U, // <2,3,3,1>: Cost 2 ins <2,3,u,1>, lane 2
+ 1879884534U, // <2,3,3,2>: Cost 2 vzipr LHS, <1,0,3,2>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1879884698U, // <2,3,3,4>: Cost 2 vzipr LHS, <1,2,3,4>
+ 2094940162U, // <2,3,3,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 2953627415U, // <2,3,3,6>: Cost 3 vzipr LHS, <2,4,3,6>
+ 1884529808U, // <2,3,3,7>: Cost 2 vzipr LHS, <1,5,3,7>
+ 1879884702U, // <2,3,3,u>: Cost 2 vzipr LHS, <1,2,3,u>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2094669827U, // <2,3,4,2>: Cost 2 ins <2,3,4,u>, lane 3
+ 2094669827U, // <2,3,4,3>: Cost 2 ins <2,3,4,u>, lane 3
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1691241782U, // <2,3,4,6>: Cost 2 vuzpl <2,2,3,3>, RHS
+ 2094669827U, // <2,3,4,7>: Cost 2 ins <2,3,4,u>, lane 3
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726274U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, <0,2,3,5>
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860843U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,1,3>
+ 2094923778U, // <2,3,5,3>: Cost 2 ins <2,3,u,3>, lane 2
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1758350646U, // <2,3,5,7>: Cost 2 vuzpr <2,2,3,3>, RHS
+ 1758350647U, // <2,3,5,u>: Cost 2 vuzpr <2,2,3,3>, RHS
+ 2094817283U, // <2,3,6,0>: Cost 2 ins <2,3,6,u>, lane 3
+ 2094907394U, // <2,3,6,1>: Cost 2 ins <2,3,u,1>, lane 2
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2094923778U, // <2,3,6,3>: Cost 2 ins <2,3,u,3>, lane 2
+ 2094817283U, // <2,3,6,4>: Cost 2 ins <2,3,6,u>, lane 3
+ 2094940162U, // <2,3,6,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1060216836U, // <2,3,6,7>: Cost 1 ins RHS, lane 4
+ 1060216836U, // <2,3,6,u>: Cost 1 ins RHS, lane 4
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2094907394U, // <2,3,7,1>: Cost 2 ins <2,3,u,1>, lane 2
+ 2974892790U, // <2,3,7,2>: Cost 3 vzipr <3,6,2,7>, <1,0,3,2>
+ 2133999620U, // <2,3,7,3>: Cost 2 ins <u,u,7,3>, lane 4
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2094940162U, // <2,3,7,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 2134024196U, // <2,3,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1879925654U, // <2,3,u,0>: Cost 2 vzipr LHS, <1,2,3,0>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1879925494U, // <2,3,u,2>: Cost 2 vzipr LHS, <1,0,3,2>
+ 1059889156U, // <2,3,u,3>: Cost 1 ins LHS, lane 4
+ 1879925658U, // <2,3,u,4>: Cost 2 vzipr LHS, <1,2,3,4>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1060216836U, // <2,3,u,7>: Cost 1 ins RHS, lane 4
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2826125312U, // <2,4,0,0>: Cost 3 vuzpr <1,2,3,4>, <0,0,0,0>
+ 2097635329U, // <2,4,0,1>: Cost 2 ins <2,u,0,1>, lane 1
+ 1691992166U, // <2,4,0,2>: Cost 2 vuzpl <2,3,4,5>, LHS
+ 3171393537U, // <2,4,0,3>: Cost 3 ins <2,u,0,3>, lane 1
+ 2765734092U, // <2,4,0,4>: Cost 3 vuzpl <2,3,4,5>, <0,2,4,6>
+ 3094528338U, // <2,4,0,5>: Cost 3 vtrnr <1,2,3,0>, <0,4,1,5>
+ 1960103222U, // <2,4,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS
+ 3171426305U, // <2,4,0,7>: Cost 3 ins <2,u,0,7>, lane 1
+ 1960103240U, // <2,4,0,u>: Cost 2 vtrnl <2,3,0,1>, RHS
+ 3204620288U, // <2,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0
+ 2826126132U, // <2,4,1,1>: Cost 3 vuzpr <1,2,3,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 1752383590U, // <2,4,1,3>: Cost 2 vuzpr <1,2,3,4>, LHS
+ 3204653056U, // <2,4,1,4>: Cost 3 ins <u,4,1,4>, lane 0
+ 2130919424U, // <2,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0
+ 3031936310U, // <2,4,1,6>: Cost 3 vtrnl <2,0,1,2>, RHS
+ 3169361922U, // <2,4,1,7>: Cost 3 ins <2,4,u,7>, lane 2
+ 1752383595U, // <2,4,1,u>: Cost 2 vuzpr <1,2,3,4>, LHS
+ 2826126230U, // <2,4,2,0>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,0>
+ 3171524609U, // <2,4,2,1>: Cost 3 ins <2,u,2,1>, lane 1
+ 2097790977U, // <2,4,2,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 2130976768U, // <2,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0
+ 1752384410U, // <2,4,2,4>: Cost 2 vuzpr <1,2,3,4>, <1,2,3,4>
+ 1825377590U, // <2,4,2,5>: Cost 2 vzipl <2,2,2,2>, RHS
+ 1959595318U, // <2,4,2,6>: Cost 2 vtrnl <2,2,2,2>, RHS
+ 3171573761U, // <2,4,2,7>: Cost 3 ins <2,u,2,7>, lane 1
+ 1825377833U, // <2,4,2,u>: Cost 2 vzipl <2,2,2,2>, RHS
+ 2826127049U, // <2,4,3,0>: Cost 3 vuzpr <1,2,3,4>, <2,3,4,0>
+ 2958270501U, // <2,4,3,1>: Cost 3 vzipr LHS, <0,0,4,1>
+ 2958270502U, // <2,4,3,2>: Cost 3 vzipr LHS, <0,0,4,2>
+ 2097872897U, // <2,4,3,3>: Cost 2 ins <2,u,3,3>, lane 1
+ 1927662800U, // <2,4,3,4>: Cost 2 vzipr LHS, <4,4,4,4>
+ 1879885518U, // <2,4,3,5>: Cost 2 vzipr LHS, <2,3,4,5>
+ 1879883980U, // <2,4,3,6>: Cost 2 vzipr LHS, <0,2,4,6>
+ 2097905665U, // <2,4,3,7>: Cost 2 ins <2,u,3,7>, lane 1
+ 1879883982U, // <2,4,3,u>: Cost 2 vzipr LHS, <0,2,4,u>
+ 2563735654U, // <2,4,4,0>: Cost 3 vext1 <2,2,4,4>, LHS
+ 2826127824U, // <2,4,4,1>: Cost 3 vuzpr <1,2,3,4>, <3,4,0,1>
+ 2826127834U, // <2,4,4,2>: Cost 3 vuzpr <1,2,3,4>, <3,4,1,2>
+ 2826127106U, // <2,4,4,3>: Cost 3 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2131132416U, // <2,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0
+ 2097963009U, // <2,4,4,5>: Cost 2 ins <2,u,4,5>, lane 1
+ 1691995446U, // <2,4,4,6>: Cost 2 vuzpl <2,3,4,5>, RHS
+ 3094562602U, // <2,4,4,7>: Cost 3 vtrnr <1,2,3,4>, <2,4,5,7>
+ 1691995464U, // <2,4,4,u>: Cost 2 vuzpl <2,3,4,5>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2765737726U, // <2,4,5,3>: Cost 3 vuzpl <2,3,4,5>, <5,2,3,4>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 2131214336U, // <2,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1752386870U, // <2,4,5,7>: Cost 2 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066380U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, <0,2,4,6>
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 1828146486U, // <2,4,6,5>: Cost 2 vzipl <2,6,3,7>, RHS
+ 2131296256U, // <2,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2131304448U, // <2,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 2867934030U, // <2,4,7,1>: Cost 3 vuzpr <u,2,3,4>, <6,7,0,1>
+ 3169320962U, // <2,4,7,2>: Cost 3 ins <2,4,u,2>, lane 2
+ 2867933312U, // <2,4,7,3>: Cost 3 vuzpr <u,2,3,4>, <5,7,1,3>
+ 3205095424U, // <2,4,7,4>: Cost 3 ins <u,4,7,4>, lane 0
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2131369984U, // <2,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 2867933352U, // <2,4,7,7>: Cost 3 vuzpr <u,2,3,4>, <5,7,5,7>
+ 2131369984U, // <2,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0
+ 1478082766U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, <0,2,4,u>
+ 2097635329U, // <2,4,u,1>: Cost 2 ins <2,u,0,1>, lane 1
+ 1691997998U, // <2,4,u,2>: Cost 2 vuzpl <2,3,4,5>, LHS
+ 1752384157U, // <2,4,u,3>: Cost 2 vuzpr <1,2,3,4>, LHS
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 1879926478U, // <2,4,u,5>: Cost 2 vzipr LHS, <2,3,4,5>
+ 1879924940U, // <2,4,u,6>: Cost 2 vzipr LHS, <0,2,4,6>
+ 1752387113U, // <2,4,u,7>: Cost 2 vuzpr <1,2,3,4>, RHS
+ 1879924942U, // <2,4,u,u>: Cost 2 vzipr LHS, <0,2,4,u>
+ 2765160612U, // <2,5,0,0>: Cost 3 vuzpl <2,2,5,7>, <0,2,0,2>
+ 2097635329U, // <2,5,0,1>: Cost 2 ins <2,u,0,1>, lane 1
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 3136335876U, // <2,5,0,5>: Cost 3 vtrnr <u,2,3,0>, <5,5,5,5>
+ 3171418113U, // <2,5,0,6>: Cost 3 ins <2,u,0,6>, lane 1
+ 2020789558U, // <2,5,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS
+ 2020789559U, // <2,5,0,u>: Cost 2 vtrnr <1,2,3,0>, RHS
+ 2599616614U, // <2,5,1,0>: Cost 3 vext1 <u,2,5,1>, LHS
+ 3205292032U, // <2,5,1,1>: Cost 3 ins <u,5,1,1>, lane 0
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 2599620736U, // <2,5,1,5>: Cost 3 vext1 <u,2,5,1>, <5,7,1,3>
+ 3205332992U, // <2,5,1,6>: Cost 3 ins <u,5,1,6>, lane 0
+ 2131599360U, // <2,5,1,7>: Cost 2 ins <u,5,1,7>, lane 0
+ 2131599360U, // <2,5,1,u>: Cost 2 ins <u,5,1,7>, lane 0
+ 3171516417U, // <2,5,2,0>: Cost 3 ins <2,u,2,0>, lane 1
+ 3006040978U, // <2,5,2,1>: Cost 3 vzipr <u,u,2,2>, <4,0,5,1>
+ 2097790977U, // <2,5,2,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 2131640320U, // <2,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 2820014256U, // <2,5,2,5>: Cost 3 vuzpr <0,2,1,5>, <0,2,1,5>
+ 2958264834U, // <2,5,2,6>: Cost 3 vzipr <0,u,2,2>, <3,4,5,6>
+ 2014612790U, // <2,5,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS
+ 2014612791U, // <2,5,2,u>: Cost 2 vtrnr <0,2,0,2>, RHS
+ 2958273506U, // <2,5,3,0>: Cost 3 vzipr LHS, <4,1,5,0>
+ 1927662482U, // <2,5,3,1>: Cost 2 vzipr LHS, <4,0,5,1>
+ 2899955454U, // <2,5,3,2>: Cost 3 vzipl <2,3,4,5>, <5,2,3,4>
+ 2097872897U, // <2,5,3,3>: Cost 2 ins <2,u,3,3>, lane 1
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 1927662810U, // <2,5,3,5>: Cost 2 vzipr LHS, <4,4,5,5>
+ 1879886338U, // <2,5,3,6>: Cost 2 vzipr LHS, <3,4,5,6>
+ 1879884800U, // <2,5,3,7>: Cost 2 vzipr LHS, <1,3,5,7>
+ 1879884801U, // <2,5,3,u>: Cost 2 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3171672065U, // <2,5,4,1>: Cost 3 ins <2,u,4,1>, lane 1
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 3034173182U, // <2,5,4,3>: Cost 3 vtrnl <2,3,4,5>, <5,2,3,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2097963009U, // <2,5,4,5>: Cost 2 ins <2,u,4,5>, lane 1
+ 2820164098U, // <2,5,4,6>: Cost 3 vuzpr <0,2,3,5>, <3,4,5,6>
+ 2020822326U, // <2,5,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS
+ 2020822327U, // <2,5,4,u>: Cost 2 vtrnr <1,2,3,4>, RHS
+ 2599649382U, // <2,5,5,0>: Cost 3 vext1 <u,2,5,5>, LHS
+ 3003411346U, // <2,5,5,1>: Cost 3 vzipr <u,4,2,5>, <4,0,5,1>
+ 2563819142U, // <2,5,5,2>: Cost 3 vext1 <2,2,5,5>, <2,2,5,5>
+ 2953642113U, // <2,5,5,3>: Cost 3 vzipr <0,1,2,5>, <0,1,5,3>
+ 2599652662U, // <2,5,5,4>: Cost 3 vext1 <u,2,5,5>, RHS
+ 2131877888U, // <2,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0
+ 2954971650U, // <2,5,5,6>: Cost 3 vzipr <0,3,2,5>, <3,4,5,6>
+ 2131894272U, // <2,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0
+ 2131877888U, // <2,5,5,u>: Cost 2 ins <u,5,5,5>, lane 0
+ 2131910656U, // <2,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <2,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <2,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <2,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <2,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <2,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <2,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <2,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <2,5,6,u>: Cost 1 ins RHS, lane 0
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2712244352U, // <2,5,7,1>: Cost 3 vext3 <4,6,0,2>, <5,7,1,3>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 2953658497U, // <2,5,7,3>: Cost 3 vzipr <0,1,2,7>, <0,1,5,3>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712244392U, // <2,5,7,5>: Cost 3 vext3 <4,6,0,2>, <5,7,5,7>
+ 2712244396U, // <2,5,7,6>: Cost 3 vext3 <4,6,0,2>, <5,7,6,2>
+ 2132041728U, // <2,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0
+ 2132041728U, // <2,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0
+ 2131910656U, // <2,5,u,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 1927703442U, // <2,5,u,1>: Cost 2 vzipr LHS, <4,0,5,1>
+ 2097790977U, // <2,5,u,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 2097872897U, // <2,5,u,3>: Cost 2 ins <2,u,3,3>, lane 1
+ 2131943424U, // <2,5,u,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 1927703770U, // <2,5,u,5>: Cost 2 vzipr LHS, <4,4,5,5>
+ 1879927298U, // <2,5,u,6>: Cost 2 vzipr LHS, <3,4,5,6>
+ 1058226176U, // <2,5,u,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <2,5,u,u>: Cost 1 ins RHS, lane 0
+ 2820243456U, // <2,6,0,0>: Cost 3 vuzpr <0,2,4,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2132148224U, // <2,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 3171393537U, // <2,6,0,3>: Cost 3 ins <2,u,0,3>, lane 1
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3170672642U, // <2,6,0,5>: Cost 3 ins <2,6,u,5>, lane 2
+ 3136335220U, // <2,6,0,6>: Cost 3 vtrnr <u,2,3,0>, <4,6,4,6>
+ 2096947202U, // <2,6,0,7>: Cost 2 ins <2,6,u,7>, lane 2
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2820244276U, // <2,6,1,1>: Cost 3 vuzpr <0,2,4,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 1746501734U, // <2,6,1,3>: Cost 2 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3205996544U, // <2,6,1,6>: Cost 3 ins <u,6,1,6>, lane 0
+ 2096947202U, // <2,6,1,7>: Cost 2 ins <2,6,u,7>, lane 2
+ 1746501739U, // <2,6,1,u>: Cost 2 vuzpr <0,2,4,6>, LHS
+ 2820244374U, // <2,6,2,0>: Cost 3 vuzpr <0,2,4,6>, <1,2,3,0>
+ 3171524609U, // <2,6,2,1>: Cost 3 ins <2,u,2,1>, lane 1
+ 2097790977U, // <2,6,2,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 2096955397U, // <2,6,2,3>: Cost 2 ins <2,6,u,u>, lane 5
+ 2820243622U, // <2,6,2,4>: Cost 3 vuzpr <0,2,4,6>, <0,2,0,4>
+ 3171557377U, // <2,6,2,5>: Cost 3 ins <2,u,2,5>, lane 1
+ 1746501836U, // <2,6,2,6>: Cost 2 vuzpr <0,2,4,6>, <0,2,4,6>
+ 1884523830U, // <2,6,2,7>: Cost 2 vzipr <0,u,2,2>, RHS
+ 1884523831U, // <2,6,2,u>: Cost 2 vzipr <0,u,2,2>, RHS
+ 2096586755U, // <2,6,3,0>: Cost 2 ins <2,6,3,u>, lane 3
+ 2096586755U, // <2,6,3,1>: Cost 2 ins <2,6,3,u>, lane 3
+ 1927662492U, // <2,6,3,2>: Cost 2 vzipr LHS, <4,0,6,2>
+ 2097872897U, // <2,6,3,3>: Cost 2 ins <2,u,3,3>, lane 1
+ 2096586755U, // <2,6,3,4>: Cost 2 ins <2,6,3,u>, lane 3
+ 2096586755U, // <2,6,3,5>: Cost 2 ins <2,6,3,u>, lane 3
+ 1927662820U, // <2,6,3,6>: Cost 2 vzipr LHS, <4,4,6,6>
+ 806145334U, // <2,6,3,7>: Cost 1 vzipr LHS, RHS
+ 806145335U, // <2,6,3,u>: Cost 1 vzipr LHS, RHS
+ 2820245292U, // <2,6,4,0>: Cost 3 vuzpr <0,2,4,6>, <2,4,6,0>
+ 3171672065U, // <2,6,4,1>: Cost 3 ins <2,u,4,1>, lane 1
+ 2820243782U, // <2,6,4,2>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,2>
+ 3171688449U, // <2,6,4,3>: Cost 3 ins <2,u,4,3>, lane 1
+ 2820243784U, // <2,6,4,4>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2132475904U, // <2,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 2096947202U, // <2,6,4,7>: Cost 2 ins <2,6,u,7>, lane 2
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3170476035U, // <2,6,5,0>: Cost 3 ins <2,6,5,u>, lane 3
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 3206258688U, // <2,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0
+ 3170656258U, // <2,6,5,3>: Cost 3 ins <2,6,u,3>, lane 2
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2868023300U, // <2,6,5,5>: Cost 3 vuzpr <u,2,4,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 1746505014U, // <2,6,5,7>: Cost 2 vuzpr <0,2,4,6>, RHS
+ 1746505015U, // <2,6,5,u>: Cost 2 vuzpr <0,2,4,6>, RHS
+ 2955643964U, // <2,6,6,0>: Cost 3 vzipr <0,4,2,6>, <4,2,6,0>
+ 2820246859U, // <2,6,6,1>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2820246860U, // <2,6,6,2>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,2>
+ 2820245412U, // <2,6,6,3>: Cost 3 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2955643968U, // <2,6,6,4>: Cost 3 vzipr <0,4,2,6>, <4,2,6,4>
+ 2820246899U, // <2,6,6,5>: Cost 3 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2132623360U, // <2,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0
+ 1881902390U, // <2,6,6,7>: Cost 2 vzipr <0,4,2,6>, RHS
+ 1881902391U, // <2,6,6,u>: Cost 2 vzipr <0,4,2,6>, RHS
+ 2132647936U, // <2,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 3124596044U, // <2,6,7,2>: Cost 3 vtrnr <6,2,5,7>, <4,6,0,2>
+ 2868023424U, // <2,6,7,3>: Cost 3 vuzpr <u,2,4,6>, <5,7,1,3>
+ 2132680704U, // <2,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 2252181996U, // <2,6,7,5>: Cost 3 vrev <6,2,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2132705280U, // <2,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0
+ 2132647936U, // <2,6,7,u>: Cost 2 ins <u,6,7,0>, lane 0
+ 2096586755U, // <2,6,u,0>: Cost 2 ins <2,6,3,u>, lane 3
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 1927703452U, // <2,6,u,2>: Cost 2 vzipr LHS, <4,0,6,2>
+ 1746502301U, // <2,6,u,3>: Cost 2 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 1927703780U, // <2,6,u,6>: Cost 2 vzipr LHS, <4,4,6,6>
+ 806186294U, // <2,6,u,7>: Cost 1 vzipr LHS, RHS
+ 806186295U, // <2,6,u,u>: Cost 1 vzipr LHS, RHS
+ 2581839974U, // <2,7,0,0>: Cost 3 vext1 <5,2,7,0>, LHS
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2581843254U, // <2,7,0,4>: Cost 3 vext1 <5,2,7,0>, RHS
+ 2581843742U, // <2,7,0,5>: Cost 3 vext1 <5,2,7,0>, <5,2,7,0>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 3136336040U, // <2,7,0,7>: Cost 3 vtrnr <u,2,3,0>, <5,7,5,7>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3206619136U, // <2,7,1,1>: Cost 3 ins <u,7,1,1>, lane 0
+ 3206627328U, // <2,7,1,2>: Cost 3 ins <u,7,1,2>, lane 0
+ 2132893696U, // <2,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0
+ 2599767350U, // <2,7,1,4>: Cost 3 vext1 <u,2,7,1>, RHS
+ 3206651904U, // <2,7,1,5>: Cost 3 ins <u,7,1,5>, lane 0
+ 3171344386U, // <2,7,1,6>: Cost 3 ins <2,7,u,6>, lane 2
+ 2599769082U, // <2,7,1,7>: Cost 3 vext1 <u,2,7,1>, <7,0,1,2>
+ 2132893696U, // <2,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0
+ 2581856358U, // <2,7,2,0>: Cost 3 vext1 <5,2,7,2>, LHS
+ 3136131918U, // <2,7,2,1>: Cost 3 vtrnr <u,2,0,2>, <6,7,0,1>
+ 2097790977U, // <2,7,2,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 2132967424U, // <2,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0
+ 2581859638U, // <2,7,2,4>: Cost 3 vext1 <5,2,7,2>, RHS
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 1770548291U, // <2,7,2,7>: Cost 2 vuzpr <4,2,6,7>, <4,2,6,7>
+ 2097790977U, // <2,7,2,u>: Cost 2 ins <2,u,2,2>, lane 1
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 1927663312U, // <2,7,3,3>: Cost 2 vzipr LHS, <5,1,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 1927663640U, // <2,7,3,7>: Cost 2 vzipr LHS, <5,5,7,7>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2581872742U, // <2,7,4,0>: Cost 3 vext1 <5,2,7,4>, LHS
+ 2581873562U, // <2,7,4,1>: Cost 3 vext1 <5,2,7,4>, <1,2,3,4>
+ 3171680257U, // <2,7,4,2>: Cost 3 ins <2,u,4,2>, lane 1
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 2581876022U, // <2,7,4,4>: Cost 3 vext1 <5,2,7,4>, RHS
+ 2133131264U, // <2,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0
+ 2712245609U, // <2,7,4,6>: Cost 3 vext3 <4,6,0,2>, <7,4,6,0>
+ 3136368808U, // <2,7,4,7>: Cost 3 vtrnr <u,2,3,4>, <5,7,5,7>
+ 2133131264U, // <2,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3206914048U, // <2,7,5,1>: Cost 3 ins <u,7,5,1>, lane 0
+ 2844290353U, // <2,7,5,2>: Cost 3 vuzpr <4,2,6,7>, <4,5,6,2>
+ 2991469050U, // <2,7,5,3>: Cost 3 vzipr <6,4,2,5>, <6,2,7,3>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3206946816U, // <2,7,5,5>: Cost 3 ins <u,7,5,5>, lane 0
+ 3206955008U, // <2,7,5,6>: Cost 3 ins <u,7,5,6>, lane 0
+ 2133221376U, // <2,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0
+ 2133221376U, // <2,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3136459598U, // <2,7,6,1>: Cost 3 vtrnr <u,2,4,6>, <6,7,0,1>
+ 2901890250U, // <2,7,6,2>: Cost 3 vzipl <2,6,3,7>, <7,2,6,3>
+ 3136458880U, // <2,7,6,3>: Cost 3 vtrnr <u,2,4,6>, <5,7,1,3>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 2133295104U, // <2,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0
+ 2133295104U, // <2,7,6,u>: Cost 2 ins <u,7,6,7>, lane 0
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3207061504U, // <2,7,7,1>: Cost 3 ins <u,7,7,1>, lane 0
+ 2563983002U, // <2,7,7,2>: Cost 3 vext1 <2,2,7,7>, <2,2,7,7>
+ 2998784506U, // <2,7,7,3>: Cost 3 vzipr <7,6,2,7>, <6,2,7,3>
+ 2599816502U, // <2,7,7,4>: Cost 3 vext1 <u,2,7,7>, RHS
+ 3207094272U, // <2,7,7,5>: Cost 3 ins <u,7,7,5>, lane 0
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2133368832U, // <2,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0
+ 2133368832U, // <2,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2097790977U, // <2,7,u,2>: Cost 2 ins <2,u,2,2>, lane 1
+ 1927704272U, // <2,7,u,3>: Cost 2 vzipr LHS, <5,1,7,3>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2133131264U, // <2,7,u,5>: Cost 2 ins <u,7,4,5>, lane 0
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 1927704600U, // <2,7,u,7>: Cost 2 vzipr LHS, <5,5,7,7>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2020786845U, // <2,u,0,3>: Cost 2 vtrnr <1,2,3,0>, LHS
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2094940162U, // <2,u,0,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 1960106138U, // <2,u,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS
+ 2020789801U, // <2,u,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2096947202U, // <2,u,1,7>: Cost 2 ins <2,6,u,7>, lane 2
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328556U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, <0,2,u,2>
+ 1825380142U, // <2,u,2,1>: Cost 2 vzipl <2,2,2,2>, LHS
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1055244288U, // <2,u,2,3>: Cost 1 ins LHS, lane 0
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 1825380506U, // <2,u,2,5>: Cost 2 vzipl <2,2,2,2>, RHS
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2014613033U, // <2,u,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS
+ 1055244288U, // <2,u,2,u>: Cost 1 ins LHS, lane 0
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1879885550U, // <2,u,3,1>: Cost 2 vzipr LHS, <2,3,u,1>
+ 1879884012U, // <2,u,3,2>: Cost 2 vzipr LHS, <0,2,u,2>
+ 806142108U, // <2,u,3,3>: Cost 1 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 1879885554U, // <2,u,3,5>: Cost 2 vzipr LHS, <2,3,u,5>
+ 1879884016U, // <2,u,3,6>: Cost 2 vzipr LHS, <0,2,u,6>
+ 806145352U, // <2,u,3,7>: Cost 1 vzipr LHS, RHS
+ 806142113U, // <2,u,3,u>: Cost 1 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 1960433454U, // <2,u,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS
+ 2020819613U, // <2,u,4,3>: Cost 2 vtrnr <1,2,3,4>, LHS
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1691610422U, // <2,u,4,6>: Cost 2 vuzpl <2,2,u,3>, RHS
+ 2020822569U, // <2,u,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2094252034U, // <2,u,5,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 2094260226U, // <2,u,5,3>: Cost 2 ins <2,2,u,3>, lane 2
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1746226486U, // <2,u,5,7>: Cost 2 vuzpr <0,2,0,u>, RHS
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361328U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, <0,2,u,6>
+ 1828149038U, // <2,u,6,1>: Cost 2 vzipl <2,6,3,7>, LHS
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2014937757U, // <2,u,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 1828149402U, // <2,u,6,5>: Cost 2 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1060216836U, // <2,u,6,7>: Cost 1 ins RHS, lane 4
+ 1060216836U, // <2,u,6,u>: Cost 1 ins RHS, lane 4
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2094907394U, // <2,u,7,1>: Cost 2 ins <2,3,u,1>, lane 2
+ 2094252034U, // <2,u,7,2>: Cost 2 ins <2,2,u,2>, lane 2
+ 2129354752U, // <2,u,7,3>: Cost 2 ins <u,1,7,3>, lane 0
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2094940162U, // <2,u,7,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 2134024196U, // <2,u,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1879925699U, // <2,u,u,0>: Cost 2 vzipr LHS, <1,2,u,0>
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 806183068U, // <2,u,u,3>: Cost 1 vzipr LHS, LHS
+ 1879925703U, // <2,u,u,4>: Cost 2 vzipr LHS, <1,2,u,4>
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1879924976U, // <2,u,u,6>: Cost 2 vzipr LHS, <0,2,u,6>
+ 806186312U, // <2,u,u,7>: Cost 1 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2960312624U, // <3,0,0,3>: Cost 3 vzipr <1,2,3,0>, <3,2,0,3>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3177381889U, // <3,0,0,5>: Cost 3 ins <3,u,0,5>, lane 1
+ 3177390081U, // <3,0,0,6>: Cost 3 ins <3,u,0,6>, lane 1
+ 3177398273U, // <3,0,0,7>: Cost 3 ins <3,u,0,7>, lane 1
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2128232448U, // <3,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2098429955U, // <3,0,1,3>: Cost 2 ins <3,0,1,u>, lane 3
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2098429955U, // <3,0,1,5>: Cost 2 ins <3,0,1,u>, lane 3
+ 2098429955U, // <3,0,1,6>: Cost 2 ins <3,0,1,u>, lane 3
+ 2098429955U, // <3,0,1,7>: Cost 2 ins <3,0,1,u>, lane 3
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2128314368U, // <3,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2098946053U, // <3,0,2,3>: Cost 2 ins <3,0,u,u>, lane 5
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2959000610U, // <3,0,2,5>: Cost 3 vzipr <1,0,3,2>, <1,4,0,5>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 3177545729U, // <3,0,2,7>: Cost 3 ins <3,u,2,7>, lane 1
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2820636924U, // <3,0,3,0>: Cost 3 vuzpr <0,3,1,0>, <0,3,1,0>
+ 1832091750U, // <3,0,3,1>: Cost 2 vzipl <3,3,3,3>, LHS
+ 1966309478U, // <3,0,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS
+ 2103844865U, // <3,0,3,3>: Cost 2 ins <3,u,3,3>, lane 1
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 2772716034U, // <3,0,3,5>: Cost 3 vuzpl <3,5,0,2>, <3,4,5,6>
+ 3177611265U, // <3,0,3,6>: Cost 3 ins <3,u,3,6>, lane 1
+ 3177619457U, // <3,0,3,7>: Cost 3 ins <3,u,3,7>, lane 1
+ 1832092317U, // <3,0,3,u>: Cost 2 vzipl <3,3,3,3>, LHS
+ 2689835334U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,2>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 2906669312U, // <3,0,4,3>: Cost 3 vzipl <3,4,5,6>, <0,3,1,4>
+ 2689835373U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,5>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2769382710U, // <3,0,4,6>: Cost 3 vuzpl <3,0,0,0>, RHS
+ 3177693185U, // <3,0,4,7>: Cost 3 ins <3,u,4,7>, lane 1
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 3101278208U, // <3,0,5,0>: Cost 3 vtrnr <2,3,4,5>, <0,0,0,0>
+ 2128527360U, // <3,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 1967145062U, // <3,0,5,2>: Cost 2 vtrnl <3,4,5,6>, LHS
+ 3040886978U, // <3,0,5,3>: Cost 3 vtrnl <3,4,5,6>, <0,2,3,5>
+ 3040886988U, // <3,0,5,4>: Cost 3 vtrnl <3,4,5,6>, <0,2,4,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2104016897U, // <3,0,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 2820640054U, // <3,0,5,7>: Cost 3 vuzpr <0,3,1,0>, RHS
+ 1967145116U, // <3,0,5,u>: Cost 2 vtrnl <3,4,5,6>, LHS
+ 3202334720U, // <3,0,6,0>: Cost 3 ins <u,0,6,0>, lane 0
+ 2907635814U, // <3,0,6,1>: Cost 3 vzipl <3,6,0,7>, LHS
+ 2128609280U, // <3,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 3177807873U, // <3,0,6,3>: Cost 3 ins <3,u,6,3>, lane 1
+ 3202367488U, // <3,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0
+ 3172663298U, // <3,0,6,5>: Cost 3 ins <3,0,u,5>, lane 2
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2098946053U, // <3,0,6,7>: Cost 2 ins <3,0,u,u>, lane 5
+ 2128609280U, // <3,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0
+ 3095396352U, // <3,0,7,0>: Cost 3 vtrnr <1,3,5,7>, <0,0,0,0>
+ 3095396362U, // <3,0,7,1>: Cost 3 vtrnr <1,3,5,7>, <0,0,1,1>
+ 2098896898U, // <3,0,7,2>: Cost 2 ins <3,0,u,2>, lane 2
+ 3177881601U, // <3,0,7,3>: Cost 3 ins <3,u,7,3>, lane 1
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3177897985U, // <3,0,7,5>: Cost 3 ins <3,u,7,5>, lane 1
+ 3202457600U, // <3,0,7,6>: Cost 3 ins <u,0,7,6>, lane 0
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2098896898U, // <3,0,7,u>: Cost 2 ins <3,0,u,2>, lane 2
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2098429955U, // <3,0,u,3>: Cost 2 ins <3,0,1,u>, lane 3
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2098429955U, // <3,0,u,6>: Cost 2 ins <3,0,1,u>, lane 3
+ 2098429955U, // <3,0,u,7>: Cost 2 ins <3,0,1,u>, lane 3
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201468U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, <0,3,1,0>
+ 2128822272U, // <3,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0
+ 1695727718U, // <3,1,0,2>: Cost 2 vuzpl <3,0,1,2>, LHS
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2960310610U, // <3,1,0,5>: Cost 3 vzipr <1,2,3,0>, <0,4,1,5>
+ 2832516572U, // <3,1,0,6>: Cost 3 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3177398273U, // <3,1,0,7>: Cost 3 ins <3,u,0,7>, lane 1
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2103689217U, // <3,1,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3177463809U, // <3,1,1,6>: Cost 3 ins <3,u,1,6>, lane 1
+ 3100952848U, // <3,1,1,7>: Cost 3 vtrnr <2,3,0,1>, <3,1,5,7>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2128961536U, // <3,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <3,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <3,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <3,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <3,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <3,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <3,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <3,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <3,1,2,u>: Cost 1 ins LHS, lane 0
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2021326950U, // <3,1,3,3>: Cost 2 vtrnr <1,3,1,3>, LHS
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2832516096U, // <3,1,3,7>: Cost 3 vuzpr <2,3,0,1>, <1,3,5,7>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234240U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, <0,3,1,4>
+ 2960343050U, // <3,1,4,1>: Cost 3 vzipr <1,2,3,4>, <0,0,1,1>
+ 2960345238U, // <3,1,4,2>: Cost 3 vzipr <1,2,3,4>, <3,0,1,2>
+ 2129133568U, // <3,1,4,3>: Cost 2 ins <u,1,4,3>, lane 0
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2129149952U, // <3,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0
+ 1695730998U, // <3,1,4,6>: Cost 2 vuzpl <3,0,1,2>, RHS
+ 3177693185U, // <3,1,4,7>: Cost 3 ins <3,u,4,7>, lane 1
+ 1695731016U, // <3,1,4,u>: Cost 2 vuzpl <3,0,1,2>, RHS
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 2961678674U, // <3,1,5,5>: Cost 3 vzipr <1,4,3,5>, <0,4,1,5>
+ 2104016897U, // <3,1,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 1758776630U, // <3,1,5,7>: Cost 2 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2907783926U, // <3,1,6,0>: Cost 3 vzipl <3,6,2,7>, <1,0,3,2>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2222752740U, // <3,1,6,2>: Cost 3 vrev <1,3,2,6>
+ 2129281024U, // <3,1,6,3>: Cost 2 ins <u,1,6,3>, lane 0
+ 2222900214U, // <3,1,6,4>: Cost 3 vrev <1,3,4,6>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2868350324U, // <3,1,6,6>: Cost 3 vuzpr <u,3,0,1>, <4,6,4,6>
+ 2129313792U, // <3,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0
+ 2129281024U, // <3,1,6,u>: Cost 2 ins <u,1,6,3>, lane 0
+ 3177857025U, // <3,1,7,0>: Cost 3 ins <3,u,7,0>, lane 1
+ 3095397172U, // <3,1,7,1>: Cost 3 vtrnr <1,3,5,7>, <1,1,1,1>
+ 2962360470U, // <3,1,7,2>: Cost 3 vzipr <1,5,3,7>, <3,0,1,2>
+ 2021654630U, // <3,1,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS
+ 3177889793U, // <3,1,7,4>: Cost 3 ins <3,u,7,4>, lane 1
+ 1149240320U, // <3,1,7,5>: Cost 2 vrev <1,3,5,7>
+ 2223055881U, // <3,1,7,6>: Cost 3 vrev <1,3,6,7>
+ 2868351144U, // <3,1,7,7>: Cost 3 vuzpr <u,3,0,1>, <5,7,5,7>
+ 2021654635U, // <3,1,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 1695733550U, // <3,1,u,2>: Cost 2 vuzpl <3,0,1,2>, LHS
+ 1055244288U, // <3,1,u,3>: Cost 1 ins LHS, lane 0
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 1695733914U, // <3,1,u,6>: Cost 2 vuzpl <3,0,1,2>, RHS
+ 1758776873U, // <3,1,u,7>: Cost 2 vuzpr <2,3,0,1>, RHS
+ 1055244288U, // <3,1,u,u>: Cost 1 ins LHS, lane 0
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2129494016U, // <3,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 1886568550U, // <3,2,0,3>: Cost 2 vzipr <1,2,3,0>, LHS
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2960311348U, // <3,2,0,5>: Cost 3 vzipr <1,2,3,0>, <1,4,2,5>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 3177398273U, // <3,2,0,7>: Cost 3 ins <3,u,0,7>, lane 1
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2103689217U, // <3,2,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3177472001U, // <3,2,1,7>: Cost 3 ins <3,u,1,7>, lane 1
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836685U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,3>
+ 3177545729U, // <3,2,2,7>: Cost 3 ins <3,u,2,7>, lane 1
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 1611450042U, // <3,2,3,2>: Cost 2 vext3 LHS, <2,3,2,3>
+ 1885929574U, // <3,2,3,3>: Cost 2 vzipr <1,1,3,3>, LHS
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 1611450082U, // <3,2,3,6>: Cost 2 vext3 LHS, <2,3,6,7>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280674U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,3,2,4>
+ 2960343060U, // <3,2,4,2>: Cost 3 vzipr <1,2,3,4>, <0,0,2,2>
+ 1886601318U, // <3,2,4,3>: Cost 2 vzipr <1,2,3,4>, LHS
+ 2960344034U, // <3,2,4,4>: Cost 3 vzipr <1,2,3,4>, <1,3,2,4>
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2129821696U, // <3,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 3177693185U, // <3,2,4,7>: Cost 3 ins <3,u,4,7>, lane 1
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316170U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, <0,3,2,5>
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2104016897U, // <3,2,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 2826554678U, // <3,2,5,7>: Cost 3 vuzpr <1,3,0,2>, RHS
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2129977344U, // <3,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 3095397270U, // <3,2,7,0>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,0>
+ 3203743744U, // <3,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0
+ 3095396516U, // <3,2,7,2>: Cost 3 vtrnr <1,3,5,7>, <0,2,0,2>
+ 1888616550U, // <3,2,7,3>: Cost 2 vzipr <1,5,3,7>, LHS
+ 3095397274U, // <3,2,7,4>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,4>
+ 3095396528U, // <3,2,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,2,1,5>
+ 1155286754U, // <3,2,7,6>: Cost 2 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 1888616555U, // <3,2,7,u>: Cost 2 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2129494016U, // <3,2,u,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2129821696U, // <3,2,u,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 2129977344U, // <3,2,u,7>: Cost 2 ins <u,2,6,7>, lane 0
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 1886569366U, // <3,3,0,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 1697874022U, // <3,3,0,2>: Cost 2 vuzpl <3,3,3,3>, LHS
+ 2100895746U, // <3,3,0,3>: Cost 2 ins <3,3,u,3>, lane 2
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 3041151490U, // <3,3,0,5>: Cost 3 vtrnl <3,5,0,2>, <3,4,5,6>
+ 3177390081U, // <3,3,0,6>: Cost 3 ins <3,u,0,6>, lane 1
+ 2960311440U, // <3,3,0,7>: Cost 3 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2103689217U, // <3,3,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1752891494U, // <3,3,1,3>: Cost 2 vuzpr <1,3,1,3>, LHS
+ 2826635515U, // <3,3,1,4>: Cost 3 vuzpr <1,3,1,3>, <3,1,3,4>
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3177463809U, // <3,3,1,6>: Cost 3 ins <3,u,1,6>, lane 1
+ 3100951552U, // <3,3,1,7>: Cost 3 vtrnr <2,3,0,1>, <1,3,5,7>
+ 1752891499U, // <3,3,1,u>: Cost 2 vuzpr <1,3,1,3>, LHS
+ 2959000470U, // <3,3,2,0>: Cost 3 vzipr <1,0,3,2>, <1,2,3,0>
+ 2959000471U, // <3,3,2,1>: Cost 3 vzipr <1,0,3,2>, <1,2,3,1>
+ 1885258486U, // <3,3,2,2>: Cost 2 vzipr <1,0,3,2>, <1,0,3,2>
+ 2130313216U, // <3,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0
+ 2959000474U, // <3,3,2,4>: Cost 3 vzipr <1,0,3,2>, <1,2,3,4>
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2959000720U, // <3,3,2,7>: Cost 3 vzipr <1,0,3,2>, <1,5,3,7>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2100568067U, // <3,3,3,1>: Cost 2 ins <3,3,3,u>, lane 3
+ 2100568067U, // <3,3,3,2>: Cost 2 ins <3,3,3,u>, lane 3
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2100568067U, // <3,3,3,5>: Cost 2 ins <3,3,3,u>, lane 3
+ 2100568067U, // <3,3,3,6>: Cost 2 ins <3,3,3,u>, lane 3
+ 2100568067U, // <3,3,3,7>: Cost 2 ins <3,3,3,u>, lane 3
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2960343958U, // <3,3,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,3,0>
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2960343798U, // <3,3,4,2>: Cost 3 vzipr <1,2,3,4>, <1,0,3,2>
+ 2100895746U, // <3,3,4,3>: Cost 2 ins <3,3,u,3>, lane 2
+ 1886602138U, // <3,3,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 1697877302U, // <3,3,4,6>: Cost 2 vuzpl <3,3,3,3>, RHS
+ 2960344208U, // <3,3,4,7>: Cost 3 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2100895746U, // <3,3,5,3>: Cost 2 ins <3,3,u,3>, lane 2
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 2027538126U, // <3,3,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2104016897U, // <3,3,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 1752894774U, // <3,3,5,7>: Cost 2 vuzpr <1,3,1,3>, RHS
+ 1752894775U, // <3,3,5,u>: Cost 2 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3204333568U, // <3,3,6,1>: Cost 3 ins <u,3,6,1>, lane 0
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2100895746U, // <3,3,6,3>: Cost 2 ins <3,3,u,3>, lane 2
+ 2234845608U, // <3,3,6,4>: Cost 3 vrev <3,3,4,6>
+ 3204366336U, // <3,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0
+ 1967893085U, // <3,3,6,6>: Cost 2 vtrnl <3,5,6,7>, <3,5,6,7>
+ 2130640896U, // <3,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0
+ 2100895746U, // <3,3,6,u>: Cost 2 ins <3,3,u,3>, lane 2
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2962359030U, // <3,3,7,2>: Cost 3 vzipr <1,5,3,7>, <1,0,3,2>
+ 2100895746U, // <3,3,7,3>: Cost 2 ins <3,3,u,3>, lane 2
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 3095398094U, // <3,3,7,5>: Cost 3 vtrnr <1,3,5,7>, <2,3,4,5>
+ 3174662146U, // <3,3,7,6>: Cost 3 ins <3,3,u,6>, lane 2
+ 2021655552U, // <3,3,7,7>: Cost 2 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2021655552U, // <3,3,7,u>: Cost 2 vtrnr <1,3,5,7>, <1,3,5,7>
+ 1886569366U, // <3,3,u,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0>
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1697879854U, // <3,3,u,2>: Cost 2 vuzpl <3,3,3,3>, LHS
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 1697880218U, // <3,3,u,6>: Cost 2 vuzpl <3,3,3,3>, RHS
+ 1752895017U, // <3,3,u,7>: Cost 2 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3177365505U, // <3,4,0,3>: Cost 3 ins <3,u,0,3>, lane 1
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1829948726U, // <3,4,0,5>: Cost 2 vzipl <3,0,1,2>, RHS
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3177398273U, // <3,4,0,7>: Cost 3 ins <3,u,0,7>, lane 1
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2820669542U, // <3,4,1,3>: Cost 3 vuzpr <0,3,1,4>, LHS
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2130919424U, // <3,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0
+ 1964166454U, // <3,4,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS
+ 3177472001U, // <3,4,1,7>: Cost 3 ins <3,u,1,7>, lane 1
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3204694016U, // <3,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2101600261U, // <3,4,2,3>: Cost 2 ins <3,4,u,u>, lane 5
+ 2826716058U, // <3,4,2,4>: Cost 3 vuzpr <1,3,2,4>, <1,2,3,4>
+ 2959001294U, // <3,4,2,5>: Cost 3 vzipr <1,0,3,2>, <2,3,4,5>
+ 2131001344U, // <3,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 3177545729U, // <3,4,2,7>: Cost 3 ins <3,u,2,7>, lane 1
+ 2101600261U, // <3,4,2,u>: Cost 2 ins <3,4,u,u>, lane 5
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2103844865U, // <3,4,3,3>: Cost 2 ins <3,u,3,3>, lane 1
+ 2820669696U, // <3,4,3,4>: Cost 3 vuzpr <0,3,1,4>, <0,3,1,4>
+ 1832095030U, // <3,4,3,5>: Cost 2 vzipl <3,3,3,3>, RHS
+ 1966312758U, // <3,4,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS
+ 3177619457U, // <3,4,3,7>: Cost 3 ins <3,u,3,7>, lane 1
+ 1832095273U, // <3,4,3,u>: Cost 2 vzipl <3,3,3,3>, RHS
+ 2960344777U, // <3,4,4,0>: Cost 3 vzipr <1,2,3,4>, <2,3,4,0>
+ 2960344778U, // <3,4,4,1>: Cost 3 vzipr <1,2,3,4>, <2,3,4,1>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2960344618U, // <3,4,4,3>: Cost 3 vzipr <1,2,3,4>, <2,1,4,3>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3177693185U, // <3,4,4,7>: Cost 3 ins <3,u,4,7>, lane 1
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2101379075U, // <3,4,5,1>: Cost 2 ins <3,4,5,u>, lane 3
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2101379075U, // <3,4,5,3>: Cost 2 ins <3,4,5,u>, lane 3
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2131214336U, // <3,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2101379075U, // <3,4,5,7>: Cost 2 ins <3,4,5,u>, lane 3
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1659227468U, // <3,4,6,0>: Cost 2 vext3 LHS, <4,6,0,2>
+ 2689838422U, // <3,4,6,1>: Cost 3 vext3 LHS, <4,6,1,3>
+ 2564417231U, // <3,4,6,2>: Cost 3 vext1 <2,3,4,6>, <2,3,4,6>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2131296256U, // <3,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2101600261U, // <3,4,6,7>: Cost 2 ins <3,4,u,u>, lane 5
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 2659972191U, // <3,4,7,1>: Cost 3 vext2 <7,1,3,4>, <7,1,3,4>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3177881601U, // <3,4,7,3>: Cost 3 ins <3,u,7,3>, lane 1
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 3095396690U, // <3,4,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,4,1,5>
+ 2131369984U, // <3,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2131369984U, // <3,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2101600261U, // <3,4,u,3>: Cost 2 ins <3,4,u,u>, lane 5
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2101379075U, // <3,4,u,7>: Cost 2 ins <3,4,5,u>, lane 3
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2832842752U, // <3,5,0,0>: Cost 3 vuzpr <2,3,4,5>, <0,0,0,0>
+ 2131476480U, // <3,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0
+ 1698709606U, // <3,5,0,2>: Cost 2 vuzpl <3,4,5,6>, LHS
+ 2772451522U, // <3,5,0,3>: Cost 3 vuzpl <3,4,5,6>, <0,2,3,5>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2960310647U, // <3,5,0,6>: Cost 3 vzipr <1,2,3,0>, <0,4,5,6>
+ 2131525632U, // <3,5,0,7>: Cost 2 ins <u,5,0,7>, lane 0
+ 1698709660U, // <3,5,0,u>: Cost 2 vuzpl <3,4,5,6>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 2832843572U, // <3,5,1,1>: Cost 3 vuzpr <2,3,4,5>, <1,1,1,1>
+ 2103689217U, // <3,5,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1759101030U, // <3,5,1,3>: Cost 2 vuzpr <2,3,4,5>, LHS
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2772452352U, // <3,5,1,5>: Cost 3 vuzpl <3,4,5,6>, <1,3,5,7>
+ 3205332992U, // <3,5,1,6>: Cost 3 ins <u,5,1,6>, lane 0
+ 2027212086U, // <3,5,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS
+ 2027212087U, // <3,5,1,u>: Cost 2 vtrnr <2,3,0,1>, RHS
+ 2832843670U, // <3,5,2,0>: Cost 3 vuzpr <2,3,4,5>, <1,2,3,0>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 2832842916U, // <3,5,2,2>: Cost 3 vuzpr <2,3,4,5>, <0,2,0,2>
+ 2131640320U, // <3,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0
+ 2832842936U, // <3,5,2,4>: Cost 3 vuzpr <2,3,4,5>, <0,2,2,4>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 2959002114U, // <3,5,2,6>: Cost 3 vzipr <1,0,3,2>, <3,4,5,6>
+ 2131673088U, // <3,5,2,7>: Cost 2 ins <u,5,2,7>, lane 0
+ 2131640320U, // <3,5,2,u>: Cost 2 ins <u,5,2,3>, lane 0
+ 2772453922U, // <3,5,3,0>: Cost 3 vuzpl <3,4,5,6>, <3,5,0,2>
+ 2832844454U, // <3,5,3,1>: Cost 3 vuzpr <2,3,4,5>, <2,3,0,1>
+ 3177578497U, // <3,5,3,2>: Cost 3 ins <3,u,3,2>, lane 1
+ 2103844865U, // <3,5,3,3>: Cost 2 ins <3,u,3,3>, lane 1
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 1759102670U, // <3,5,3,5>: Cost 2 vuzpr <2,3,4,5>, <2,3,4,5>
+ 2959673858U, // <3,5,3,6>: Cost 3 vzipr <1,1,3,3>, <3,4,5,6>
+ 2021330230U, // <3,5,3,7>: Cost 2 vtrnr <1,3,1,3>, RHS
+ 2021330231U, // <3,5,3,u>: Cost 2 vtrnr <1,3,1,3>, RHS
+ 2832845308U, // <3,5,4,0>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,0>
+ 2732969871U, // <3,5,4,1>: Cost 3 vext3 LHS, <5,4,1,5>
+ 2832844536U, // <3,5,4,2>: Cost 3 vuzpr <2,3,4,5>, <2,4,0,2>
+ 3177660417U, // <3,5,4,3>: Cost 3 ins <3,u,4,3>, lane 1
+ 2832845312U, // <3,5,4,4>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,4>
+ 2131804160U, // <3,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0
+ 1698712886U, // <3,5,4,6>: Cost 2 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1698712904U, // <3,5,4,u>: Cost 2 vuzpl <3,4,5,6>, RHS
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2832846074U, // <3,5,5,1>: Cost 3 vuzpr <2,3,4,5>, <4,5,0,1>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2832845356U, // <3,5,5,3>: Cost 3 vuzpr <2,3,4,5>, <3,5,1,3>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2104016897U, // <3,5,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 1759104310U, // <3,5,5,7>: Cost 2 vuzpr <2,3,4,5>, RHS
+ 1759104311U, // <3,5,5,u>: Cost 2 vuzpr <2,3,4,5>, RHS
+ 2131910656U, // <3,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <3,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <3,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <3,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <3,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <3,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <3,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <3,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <3,5,6,u>: Cost 1 ins RHS, lane 0
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 3095397528U, // <3,5,7,6>: Cost 3 vtrnr <1,3,5,7>, <1,5,4,6>
+ 2021657910U, // <3,5,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 1698715438U, // <3,5,u,2>: Cost 2 vuzpl <3,4,5,6>, LHS
+ 1759101597U, // <3,5,u,3>: Cost 2 vuzpr <2,3,4,5>, LHS
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 1698715802U, // <3,5,u,6>: Cost 2 vuzpl <3,4,5,6>, RHS
+ 1058226176U, // <3,5,u,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <3,5,u,u>: Cost 1 ins RHS, lane 0
+ 2732970264U, // <3,6,0,0>: Cost 3 vext3 LHS, <6,0,0,2>
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2132148224U, // <3,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 3177365505U, // <3,6,0,3>: Cost 3 ins <3,u,0,3>, lane 1
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 1886571830U, // <3,6,0,7>: Cost 2 vzipr <1,2,3,0>, RHS
+ 1886571831U, // <3,6,0,u>: Cost 2 vzipr <1,2,3,0>, RHS
+ 2720878954U, // <3,6,1,0>: Cost 3 vext3 <6,1,0,3>, <6,1,0,3>
+ 3205955584U, // <3,6,1,1>: Cost 3 ins <u,6,1,1>, lane 0
+ 2103689217U, // <3,6,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 2826731622U, // <3,6,1,3>: Cost 3 vuzpr <1,3,2,6>, LHS
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3205988352U, // <3,6,1,5>: Cost 3 ins <u,6,1,5>, lane 0
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2954349878U, // <3,6,1,7>: Cost 3 vzipr <0,2,3,1>, RHS
+ 2103689217U, // <3,6,1,u>: Cost 2 ins <3,u,1,2>, lane 1
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 2132303872U, // <3,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2826731724U, // <3,6,2,6>: Cost 3 vuzpr <1,3,2,6>, <0,2,4,6>
+ 1885261110U, // <3,6,2,7>: Cost 2 vzipr <1,0,3,2>, RHS
+ 1885261111U, // <3,6,2,u>: Cost 2 vzipr <1,0,3,2>, RHS
+ 3136876642U, // <3,6,3,0>: Cost 3 vtrnr <u,3,1,3>, <5,6,7,0>
+ 3206103040U, // <3,6,3,1>: Cost 3 ins <u,6,3,1>, lane 0
+ 3001478044U, // <3,6,3,2>: Cost 3 vzipr <u,1,3,3>, <4,0,6,2>
+ 2103844865U, // <3,6,3,3>: Cost 2 ins <3,u,3,3>, lane 1
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3206135808U, // <3,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0
+ 1699457629U, // <3,6,3,6>: Cost 2 vuzpl <3,5,6,7>, <3,5,6,7>
+ 1885932854U, // <3,6,3,7>: Cost 2 vzipr <1,1,3,3>, RHS
+ 1885932855U, // <3,6,3,u>: Cost 2 vzipr <1,1,3,3>, RHS
+ 2732970588U, // <3,6,4,0>: Cost 3 vext3 LHS, <6,4,0,2>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970604U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,0>
+ 2906673714U, // <3,6,4,3>: Cost 3 vzipl <3,4,5,6>, <6,3,4,5>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2132475904U, // <3,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 1886604598U, // <3,6,4,7>: Cost 2 vzipr <1,2,3,4>, RHS
+ 1886604599U, // <3,6,4,u>: Cost 2 vzipr <1,2,3,4>, RHS
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3206250496U, // <3,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 3040891442U, // <3,6,5,4>: Cost 3 vtrnl <3,4,5,6>, <6,3,4,5>
+ 3206283264U, // <3,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0
+ 2104016897U, // <3,6,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 2954382646U, // <3,6,5,7>: Cost 3 vzipr <0,2,3,5>, RHS
+ 2104016897U, // <3,6,5,u>: Cost 2 ins <3,u,5,6>, lane 1
+ 2732970748U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,0>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2732970768U, // <3,6,6,2>: Cost 3 vext3 LHS, <6,6,2,2>
+ 3177807873U, // <3,6,6,3>: Cost 3 ins <3,u,6,3>, lane 1
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 1611453282U, // <3,6,7,2>: Cost 2 vext3 LHS, <6,7,2,3>
+ 2968996198U, // <3,6,7,3>: Cost 3 vzipr <2,6,3,7>, <3,2,6,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2968995633U, // <3,6,7,5>: Cost 3 vzipr <2,6,3,7>, <2,4,6,5>
+ 1611453322U, // <3,6,7,6>: Cost 2 vext3 LHS, <6,7,6,7>
+ 1888619830U, // <3,6,7,7>: Cost 2 vzipr <1,5,3,7>, RHS
+ 1888619831U, // <3,6,7,u>: Cost 2 vzipr <1,5,3,7>, RHS
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2132148224U, // <3,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 2132303872U, // <3,6,u,3>: Cost 2 ins <u,6,2,3>, lane 0
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 2132475904U, // <3,6,u,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 1885310262U, // <3,6,u,7>: Cost 2 vzipr <1,0,3,u>, RHS
+ 1885310263U, // <3,6,u,u>: Cost 2 vzipr <1,0,3,u>, RHS
+ 2826960896U, // <3,7,0,0>: Cost 3 vuzpr <1,3,5,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2826960916U, // <3,7,0,2>: Cost 3 vuzpr <1,3,5,7>, <0,0,2,2>
+ 3002117840U, // <3,7,0,3>: Cost 3 vzipr <u,2,3,0>, <5,1,7,3>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2826961716U, // <3,7,1,1>: Cost 3 vuzpr <1,3,5,7>, <1,1,1,1>
+ 2103689217U, // <3,7,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1753219174U, // <3,7,1,3>: Cost 2 vuzpr <1,3,5,7>, LHS
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1753219179U, // <3,7,1,u>: Cost 2 vuzpr <1,3,5,7>, LHS
+ 2826961814U, // <3,7,2,0>: Cost 3 vuzpr <1,3,5,7>, <1,2,3,0>
+ 3206692864U, // <3,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0
+ 2826961060U, // <3,7,2,2>: Cost 3 vuzpr <1,3,5,7>, <0,2,0,2>
+ 2132967424U, // <3,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0
+ 2826961818U, // <3,7,2,4>: Cost 3 vuzpr <1,3,5,7>, <1,2,3,4>
+ 2826961072U, // <3,7,2,5>: Cost 3 vuzpr <1,3,5,7>, <0,2,1,5>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2826962598U, // <3,7,3,1>: Cost 3 vuzpr <1,3,5,7>, <2,3,0,1>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2103844865U, // <3,7,3,3>: Cost 2 ins <3,u,3,3>, lane 1
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2826962638U, // <3,7,3,5>: Cost 3 vuzpr <1,3,5,7>, <2,3,4,5>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 1753220096U, // <3,7,3,7>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7>
+ 1753220096U, // <3,7,3,u>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3002150608U, // <3,7,4,3>: Cost 3 vzipr <u,2,3,4>, <5,1,7,3>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2826961244U, // <3,7,4,6>: Cost 3 vuzpr <1,3,5,7>, <0,4,2,6>
+ 2732971383U, // <3,7,4,7>: Cost 3 vext3 LHS, <7,4,7,5>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2826963551U, // <3,7,5,0>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,0>
+ 2826963552U, // <3,7,5,1>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,1>
+ 2826962032U, // <3,7,5,2>: Cost 3 vuzpr <1,3,5,7>, <1,5,0,2>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2826963555U, // <3,7,5,4>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,4>
+ 2826962044U, // <3,7,5,5>: Cost 3 vuzpr <1,3,5,7>, <1,5,1,5>
+ 2104016897U, // <3,7,5,6>: Cost 2 ins <3,u,5,6>, lane 1
+ 1753222454U, // <3,7,5,7>: Cost 2 vuzpr <1,3,5,7>, RHS
+ 1753222455U, // <3,7,5,u>: Cost 2 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2133295104U, // <3,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0
+ 2133295104U, // <3,7,6,u>: Cost 2 ins <u,7,6,7>, lane 0
+ 2962362223U, // <3,7,7,0>: Cost 3 vzipr <1,5,3,7>, <5,3,7,0>
+ 2826965109U, // <3,7,7,1>: Cost 3 vuzpr <1,3,5,7>, <5,7,0,1>
+ 2968998474U, // <3,7,7,2>: Cost 3 vzipr <2,6,3,7>, <6,3,7,2>
+ 2826963662U, // <3,7,7,3>: Cost 3 vuzpr <1,3,5,7>, <3,7,1,3>
+ 2962362227U, // <3,7,7,4>: Cost 3 vzipr <1,5,3,7>, <5,3,7,4>
+ 2826965149U, // <3,7,7,5>: Cost 3 vuzpr <1,3,5,7>, <5,7,4,5>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2826962300U, // <3,7,u,0>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,0>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2103689217U, // <3,7,u,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1753219741U, // <3,7,u,3>: Cost 2 vuzpr <1,3,5,7>, LHS
+ 2826962304U, // <3,7,u,4>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,4>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1753222697U, // <3,7,u,7>: Cost 2 vuzpr <1,3,5,7>, RHS
+ 1753219746U, // <3,7,u,u>: Cost 2 vuzpr <1,3,5,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1696243814U, // <3,u,0,2>: Cost 2 vuzpl <3,0,u,2>, LHS
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1829951642U, // <3,u,0,5>: Cost 2 vzipl <3,0,1,2>, RHS
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 1886571848U, // <3,u,0,7>: Cost 2 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 1964169370U, // <3,u,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS
+ 2027212329U, // <3,u,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1659672428U, // <3,u,2,0>: Cost 2 vext3 LHS, <u,2,0,2>
+ 2128969728U, // <3,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1055244288U, // <3,u,2,3>: Cost 1 ins LHS, lane 0
+ 1659672468U, // <3,u,2,4>: Cost 2 vext3 LHS, <u,2,4,6>
+ 2129002496U, // <3,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1885261128U, // <3,u,2,7>: Cost 2 vzipr <1,0,3,2>, RHS
+ 1055244288U, // <3,u,2,u>: Cost 1 ins LHS, lane 0
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1616541639U, // <3,u,3,1>: Cost 2 vext3 LHS, <u,3,1,3>
+ 1966315310U, // <3,u,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1966315674U, // <3,u,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS
+ 1885932872U, // <3,u,3,7>: Cost 2 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2960344003U, // <3,u,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,u,0>
+ 1832933166U, // <3,u,4,1>: Cost 2 vzipl <3,4,5,6>, LHS
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1886601372U, // <3,u,4,3>: Cost 2 vzipr <1,2,3,4>, LHS
+ 1886602138U, // <3,u,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1696247094U, // <3,u,4,6>: Cost 2 vuzpl <3,0,u,2>, RHS
+ 1886604616U, // <3,u,4,7>: Cost 2 vzipr <1,2,3,4>, RHS
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2128527360U, // <3,u,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 2027538126U, // <3,u,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1752935734U, // <3,u,5,7>: Cost 2 vuzpr <1,3,1,u>, RHS
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1663875248U, // <3,u,6,0>: Cost 2 vext3 LHS, <u,6,0,2>
+ 2131918848U, // <3,u,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2128609280U, // <3,u,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1663875288U, // <3,u,6,4>: Cost 2 vext3 LHS, <u,6,4,6>
+ 2131951616U, // <3,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131296256U, // <3,u,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 1058226176U, // <3,u,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <3,u,6,u>: Cost 1 ins RHS, lane 0
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2098896898U, // <3,u,7,2>: Cost 2 ins <3,0,u,2>, lane 2
+ 2021655197U, // <3,u,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659230515U, // <3,u,7,5>: Cost 2 vext3 LHS, <u,7,5,7>
+ 2131369984U, // <3,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 2021658153U, // <3,u,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS
+ 2021655202U, // <3,u,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 1055244288U, // <3,u,u,3>: Cost 1 ins LHS, lane 0
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1058226176U, // <3,u,u,7>: Cost 1 ins RHS, lane 0
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2128150528U, // <4,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0
+ 2104860674U, // <4,0,0,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 1705607270U, // <4,0,0,2>: Cost 2 vuzpl <4,6,0,2>, LHS
+ 3178070019U, // <4,0,0,3>: Cost 3 ins <4,0,0,u>, lane 3
+ 2909946194U, // <4,0,0,4>: Cost 3 vzipl <4,0,5,1>, <0,4,1,5>
+ 3178070019U, // <4,0,0,5>: Cost 3 ins <4,0,0,u>, lane 3
+ 3183362049U, // <4,0,0,6>: Cost 3 ins <4,u,0,6>, lane 1
+ 2109628417U, // <4,0,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 1705607324U, // <4,0,0,u>: Cost 2 vuzpl <4,6,0,2>, LHS
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2128232448U, // <4,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2833612902U, // <4,0,1,3>: Cost 3 vuzpr <2,4,6,0>, LHS
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2779350016U, // <4,0,1,5>: Cost 3 vuzpl <4,6,0,2>, <1,3,5,7>
+ 3202015232U, // <4,0,1,6>: Cost 3 ins <u,0,1,6>, lane 0
+ 2109702145U, // <4,0,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 2104860674U, // <4,0,2,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 2128314368U, // <4,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2104918021U, // <4,0,2,3>: Cost 2 ins <4,0,u,u>, lane 5
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3044622465U, // <4,0,2,5>: Cost 3 vtrnl <4,1,2,3>, <0,1,5,3>
+ 2833613004U, // <4,0,2,6>: Cost 3 vuzpr <2,4,6,0>, <0,2,4,6>
+ 2109775873U, // <4,0,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 2104860674U, // <4,0,2,u>: Cost 2 ins <4,0,u,1>, lane 2
+ 3202113536U, // <4,0,3,0>: Cost 3 ins <u,0,3,0>, lane 0
+ 2104860674U, // <4,0,3,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 2128388096U, // <4,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 2779351452U, // <4,0,3,3>: Cost 3 vuzpl <4,6,0,2>, <3,3,3,3>
+ 3178627074U, // <4,0,3,4>: Cost 3 ins <4,0,u,4>, lane 2
+ 2839512782U, // <4,0,3,5>: Cost 3 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3178643458U, // <4,0,3,6>: Cost 3 ins <4,0,u,6>, lane 2
+ 2109849601U, // <4,0,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2104860674U, // <4,0,3,u>: Cost 2 ins <4,0,u,1>, lane 2
+ 1705610572U, // <4,0,4,0>: Cost 2 vuzpl <4,6,0,2>, <4,6,0,2>
+ 2104860674U, // <4,0,4,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 1974370406U, // <4,0,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS
+ 3178364931U, // <4,0,4,3>: Cost 3 ins <4,0,4,u>, lane 3
+ 2109898753U, // <4,0,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 2104918021U, // <4,0,4,5>: Cost 2 ins <4,0,u,u>, lane 5
+ 1705610550U, // <4,0,4,6>: Cost 2 vuzpl <4,6,0,2>, RHS
+ 2109923329U, // <4,0,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 1705610568U, // <4,0,4,u>: Cost 2 vuzpl <4,6,0,2>, RHS
+ 1839644672U, // <4,0,5,0>: Cost 2 vzipl RHS, <0,0,0,0>
+ 765902950U, // <4,0,5,1>: Cost 1 vzipl RHS, LHS
+ 1839644836U, // <4,0,5,2>: Cost 2 vzipl RHS, <0,2,0,2>
+ 2104696835U, // <4,0,5,3>: Cost 2 ins <4,0,5,u>, lane 3
+ 1839645010U, // <4,0,5,4>: Cost 2 vzipl RHS, <0,4,1,5>
+ 2109980673U, // <4,0,5,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 2104696835U, // <4,0,5,6>: Cost 2 ins <4,0,5,u>, lane 3
+ 2104696835U, // <4,0,5,7>: Cost 2 ins <4,0,5,u>, lane 3
+ 765903517U, // <4,0,5,u>: Cost 1 vzipl RHS, LHS
+ 1973862400U, // <4,0,6,0>: Cost 2 vtrnl RHS, <0,0,0,0>
+ 1973862410U, // <4,0,6,1>: Cost 2 vtrnl RHS, <0,0,1,1>
+ 900120678U, // <4,0,6,2>: Cost 1 vtrnl RHS, LHS
+ 2104770563U, // <4,0,6,3>: Cost 2 ins <4,0,6,u>, lane 3
+ 1973862604U, // <4,0,6,4>: Cost 2 vtrnl RHS, <0,2,4,6>
+ 2104770563U, // <4,0,6,5>: Cost 2 ins <4,0,6,u>, lane 3
+ 2110062593U, // <4,0,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <4,0,6,7>: Cost 1 ins RHS, lane 1
+ 900120732U, // <4,0,6,u>: Cost 1 vtrnl RHS, LHS
+ 3202408448U, // <4,0,7,0>: Cost 3 ins <u,0,7,0>, lane 0
+ 2104860674U, // <4,0,7,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 2104868866U, // <4,0,7,2>: Cost 2 ins <4,0,u,2>, lane 2
+ 3114049557U, // <4,0,7,3>: Cost 3 vtrnr <4,4,6,7>, <0,0,2,3>
+ 3178627074U, // <4,0,7,4>: Cost 3 ins <4,0,u,4>, lane 2
+ 2779354470U, // <4,0,7,5>: Cost 3 vuzpl <4,6,0,2>, <7,4,5,6>
+ 2779354473U, // <4,0,7,6>: Cost 3 vuzpl <4,6,0,2>, <7,4,6,0>
+ 2110144513U, // <4,0,7,7>: Cost 2 ins <4,u,7,7>, lane 1
+ 2104860674U, // <4,0,7,u>: Cost 2 ins <4,0,u,1>, lane 2
+ 1974009856U, // <4,0,u,0>: Cost 2 vtrnl RHS, <0,0,0,0>
+ 767893606U, // <4,0,u,1>: Cost 1 vzipl RHS, LHS
+ 900268134U, // <4,0,u,2>: Cost 1 vtrnl RHS, LHS
+ 2104918021U, // <4,0,u,3>: Cost 2 ins <4,0,u,u>, lane 5
+ 1974010060U, // <4,0,u,4>: Cost 2 vtrnl RHS, <0,2,4,6>
+ 2104918021U, // <4,0,u,5>: Cost 2 ins <4,0,u,u>, lane 5
+ 1705613466U, // <4,0,u,6>: Cost 2 vuzpl <4,6,0,2>, RHS
+ 1036328961U, // <4,0,u,7>: Cost 1 ins RHS, lane 1
+ 900268188U, // <4,0,u,u>: Cost 1 vtrnl RHS, LHS
+ 2600640614U, // <4,1,0,0>: Cost 3 vext1 <u,4,1,0>, LHS
+ 2128822272U, // <4,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0
+ 2109587457U, // <4,1,0,2>: Cost 2 ins <4,u,0,2>, lane 1
+ 2128838656U, // <4,1,0,3>: Cost 2 ins <u,1,0,3>, lane 0
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3047785472U, // <4,1,0,5>: Cost 3 vtrnl <4,6,0,2>, <1,3,5,7>
+ 3183362049U, // <4,1,0,6>: Cost 3 ins <4,u,0,6>, lane 1
+ 2109628417U, // <4,1,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 2109587457U, // <4,1,0,u>: Cost 2 ins <4,u,0,2>, lane 1
+ 3202629632U, // <4,1,1,0>: Cost 3 ins <u,1,1,0>, lane 0
+ 2128896000U, // <4,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2128912384U, // <4,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0
+ 3202662400U, // <4,1,1,4>: Cost 3 ins <u,1,1,4>, lane 0
+ 2958401874U, // <4,1,1,5>: Cost 3 vzipr <0,u,4,1>, <0,4,1,5>
+ 2778801323U, // <4,1,1,6>: Cost 3 vuzpl <4,5,1,7>, <1,5,6,7>
+ 2109702145U, // <4,1,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 2128896000U, // <4,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0
+ 2128961536U, // <4,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <4,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <4,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <4,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <4,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <4,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <4,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <4,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <4,1,2,u>: Cost 1 ins LHS, lane 0
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2129059840U, // <4,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2109849601U, // <4,1,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2129059840U, // <4,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0
+ 2600673382U, // <4,1,4,0>: Cost 3 vext1 <u,4,1,4>, LHS
+ 1705061641U, // <4,1,4,1>: Cost 2 vuzpl <4,5,1,7>, <4,5,1,7>
+ 2912641946U, // <4,1,4,2>: Cost 3 vzipl <4,4,5,6>, <1,2,3,4>
+ 2040135782U, // <4,1,4,3>: Cost 2 vtrnr <4,4,4,4>, LHS
+ 2109898753U, // <4,1,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 2129149952U, // <4,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0
+ 2109915137U, // <4,1,4,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 2109923329U, // <4,1,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 2109915137U, // <4,1,4,u>: Cost 2 ins <4,u,4,6>, lane 1
+ 1479164242U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, <0,4,1,5>
+ 1839645492U, // <4,1,5,1>: Cost 2 vzipl RHS, <1,1,1,1>
+ 1839645590U, // <4,1,5,2>: Cost 2 vzipl RHS, <1,2,3,0>
+ 2016034918U, // <4,1,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839645840U, // <4,1,5,5>: Cost 2 vzipl RHS, <1,5,3,7>
+ 3089776763U, // <4,1,5,6>: Cost 3 vtrnr <0,4,1,5>, <0,1,4,6>
+ 2109997057U, // <4,1,5,7>: Cost 2 ins <4,u,5,7>, lane 1
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2110013441U, // <4,1,6,0>: Cost 2 ins <4,u,6,0>, lane 1
+ 1973863220U, // <4,1,6,1>: Cost 2 vtrnl RHS, <1,1,1,1>
+ 2110029825U, // <4,1,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2016116838U, // <4,1,6,3>: Cost 2 vtrnr <0,4,2,6>, LHS
+ 2110046209U, // <4,1,6,4>: Cost 2 ins <4,u,6,4>, lane 1
+ 1973863424U, // <4,1,6,5>: Cost 2 vtrnl RHS, <1,3,5,7>
+ 2110062593U, // <4,1,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <4,1,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,1,6,u>: Cost 1 ins RHS, lane 1
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3203080192U, // <4,1,7,1>: Cost 3 ins <u,1,7,1>, lane 0
+ 3203088384U, // <4,1,7,2>: Cost 3 ins <u,1,7,2>, lane 0
+ 2129354752U, // <4,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0
+ 2664666470U, // <4,1,7,4>: Cost 3 vext2 <7,u,4,1>, <7,4,5,6>
+ 3203112960U, // <4,1,7,5>: Cost 3 ins <u,1,7,5>, lane 0
+ 3114049641U, // <4,1,7,6>: Cost 3 vtrnr <4,4,6,7>, <0,1,2,6>
+ 2110144513U, // <4,1,7,7>: Cost 2 ins <4,u,7,7>, lane 1
+ 2129354752U, // <4,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0
+ 1479188821U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, <0,4,1,u>
+ 1974010676U, // <4,1,u,1>: Cost 2 vtrnl RHS, <1,1,1,1>
+ 1841636246U, // <4,1,u,2>: Cost 2 vzipl RHS, <1,2,3,0>
+ 1055244288U, // <4,1,u,3>: Cost 1 ins LHS, lane 0
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 1974010880U, // <4,1,u,5>: Cost 2 vtrnl RHS, <1,3,5,7>
+ 2109915137U, // <4,1,u,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 1036328961U, // <4,1,u,7>: Cost 1 ins RHS, lane 1
+ 1055244288U, // <4,1,u,u>: Cost 1 ins LHS, lane 0
+ 3047786150U, // <4,2,0,0>: Cost 3 vtrnl <4,6,0,2>, <2,3,0,1>
+ 2109579265U, // <4,2,0,1>: Cost 2 ins <4,u,0,1>, lane 1
+ 2129494016U, // <4,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 2967019622U, // <4,2,0,3>: Cost 3 vzipr <2,3,4,0>, LHS
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 2909947747U, // <4,2,0,5>: Cost 3 vzipl <4,0,5,1>, <2,5,3,1>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 2109628417U, // <4,2,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 2129494016U, // <4,2,0,u>: Cost 2 ins <u,2,0,2>, lane 0
+ 3203293184U, // <4,2,1,0>: Cost 3 ins <u,2,1,0>, lane 0
+ 3203301376U, // <4,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0
+ 3203309568U, // <4,2,1,2>: Cost 3 ins <u,2,1,2>, lane 0
+ 2821242982U, // <4,2,1,3>: Cost 3 vuzpr <0,4,0,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3203334144U, // <4,2,1,5>: Cost 3 ins <u,2,1,5>, lane 0
+ 3203342336U, // <4,2,1,6>: Cost 3 ins <u,2,1,6>, lane 0
+ 2109702145U, // <4,2,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 2109702145U, // <4,2,1,u>: Cost 2 ins <4,u,1,7>, lane 1
+ 2229208824U, // <4,2,2,0>: Cost 3 vrev <2,4,0,2>
+ 2911397400U, // <4,2,2,1>: Cost 3 vzipl <4,2,6,7>, <2,1,2,3>
+ 2129641472U, // <4,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0
+ 2129649664U, // <4,2,2,3>: Cost 2 ins <u,2,2,3>, lane 0
+ 2697954940U, // <4,2,2,4>: Cost 3 vext3 <2,2,4,4>, <2,2,4,4>
+ 2911397764U, // <4,2,2,5>: Cost 3 vzipl <4,2,6,7>, <2,5,6,7>
+ 2821243084U, // <4,2,2,6>: Cost 3 vuzpr <0,4,0,2>, <0,2,4,6>
+ 2109775873U, // <4,2,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 2129641472U, // <4,2,2,u>: Cost 2 ins <u,2,2,2>, lane 0
+ 2129698816U, // <4,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 2229290754U, // <4,2,3,1>: Cost 3 vrev <2,4,1,3>
+ 3203457024U, // <4,2,3,2>: Cost 3 ins <u,2,3,2>, lane 0
+ 2129723392U, // <4,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0
+ 2129731584U, // <4,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 2833188558U, // <4,2,3,5>: Cost 3 vuzpr <2,4,0,2>, <2,3,4,5>
+ 3203489792U, // <4,2,3,6>: Cost 3 ins <u,2,3,6>, lane 0
+ 2109849601U, // <4,2,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2129698816U, // <4,2,3,u>: Cost 2 ins <u,2,3,0>, lane 0
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 1702448074U, // <4,2,4,2>: Cost 2 vuzpl <4,1,2,3>, <4,1,2,3>
+ 1905918054U, // <4,2,4,3>: Cost 2 vzipr <4,4,4,4>, LHS
+ 2109898753U, // <4,2,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 2109906945U, // <4,2,4,5>: Cost 2 ins <4,u,4,5>, lane 1
+ 2129821696U, // <4,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 2109923329U, // <4,2,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 2129821696U, // <4,2,4,u>: Cost 2 ins <u,2,4,6>, lane 0
+ 3089777558U, // <4,2,5,0>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,0>
+ 2109947905U, // <4,2,5,1>: Cost 2 ins <4,u,5,1>, lane 1
+ 1839646312U, // <4,2,5,2>: Cost 2 vzipl RHS, <2,2,2,2>
+ 1893318758U, // <4,2,5,3>: Cost 2 vzipr <2,3,4,5>, LHS
+ 3089777562U, // <4,2,5,4>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,4>
+ 2109980673U, // <4,2,5,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 1839646650U, // <4,2,5,6>: Cost 2 vzipl RHS, <2,6,3,7>
+ 2109997057U, // <4,2,5,7>: Cost 2 ins <4,u,5,7>, lane 1
+ 1893318763U, // <4,2,5,u>: Cost 2 vzipr <2,3,4,5>, LHS
+ 1479246172U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, <0,4,2,6>
+ 2110021633U, // <4,2,6,1>: Cost 2 ins <4,u,6,1>, lane 1
+ 1973864040U, // <4,2,6,2>: Cost 2 vtrnl RHS, <2,2,2,2>
+ 1880719462U, // <4,2,6,3>: Cost 2 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2110054401U, // <4,2,6,5>: Cost 2 ins <4,u,6,5>, lane 1
+ 2110062593U, // <4,2,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <4,2,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,2,6,u>: Cost 1 ins RHS, lane 1
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3203743744U, // <4,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0
+ 3203751936U, // <4,2,7,2>: Cost 3 ins <u,2,7,2>, lane 0
+ 2130018304U, // <4,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0
+ 3102032794U, // <4,2,7,4>: Cost 3 vtrnr <2,4,5,7>, <1,2,3,4>
+ 2229618474U, // <4,2,7,5>: Cost 3 vrev <2,4,5,7>
+ 3203784704U, // <4,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0
+ 2110144513U, // <4,2,7,7>: Cost 2 ins <4,u,7,7>, lane 1
+ 2130018304U, // <4,2,7,u>: Cost 2 ins <u,2,7,3>, lane 0
+ 1479262558U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, <0,4,2,u>
+ 2109947905U, // <4,2,u,1>: Cost 2 ins <4,u,5,1>, lane 1
+ 1974011496U, // <4,2,u,2>: Cost 2 vtrnl RHS, <2,2,2,2>
+ 1880735846U, // <4,2,u,3>: Cost 2 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2109980673U, // <4,2,u,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 1841637306U, // <4,2,u,6>: Cost 2 vzipl RHS, <2,6,3,7>
+ 1036328961U, // <4,2,u,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,2,u,u>: Cost 1 ins RHS, lane 1
+ 3203883008U, // <4,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0
+ 2130149376U, // <4,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0
+ 2109587457U, // <4,3,0,2>: Cost 2 ins <4,u,0,2>, lane 1
+ 3047786908U, // <4,3,0,3>: Cost 3 vtrnl <4,6,0,2>, <3,3,3,3>
+ 2967020442U, // <4,3,0,4>: Cost 3 vzipr <2,3,4,0>, <1,2,3,4>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3183362049U, // <4,3,0,6>: Cost 3 ins <4,u,0,6>, lane 1
+ 2109628417U, // <4,3,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 2130149376U, // <4,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3203964928U, // <4,3,1,1>: Cost 3 ins <u,3,1,1>, lane 0
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2130239488U, // <4,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0
+ 2967028634U, // <4,3,1,4>: Cost 3 vzipr <2,3,4,1>, <1,2,3,4>
+ 3203997696U, // <4,3,1,5>: Cost 3 ins <u,3,1,5>, lane 0
+ 2821398633U, // <4,3,1,6>: Cost 3 vuzpr <0,4,2,3>, <0,1,2,6>
+ 2109702145U, // <4,3,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 2130239488U, // <4,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0
+ 3204030464U, // <4,3,2,0>: Cost 3 ins <u,3,2,0>, lane 0
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3204046848U, // <4,3,2,2>: Cost 3 ins <u,3,2,2>, lane 0
+ 2130313216U, // <4,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0
+ 2833269658U, // <4,3,2,4>: Cost 3 vuzpr <2,4,1,3>, <1,2,3,4>
+ 3101624014U, // <4,3,2,5>: Cost 3 vtrnr <2,4,0,2>, <2,3,4,5>
+ 3204079616U, // <4,3,2,6>: Cost 3 ins <u,3,2,6>, lane 0
+ 2109775873U, // <4,3,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 2130313216U, // <4,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0
+ 3204104192U, // <4,3,3,0>: Cost 3 ins <u,3,3,0>, lane 0
+ 2779564182U, // <4,3,3,1>: Cost 3 vuzpl <4,6,3,1>, <3,0,1,2>
+ 2636810580U, // <4,3,3,2>: Cost 3 vext2 <3,2,4,3>, <3,2,4,3>
+ 2130386944U, // <4,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0
+ 2965717914U, // <4,3,3,4>: Cost 3 vzipr <2,1,4,3>, <1,2,3,4>
+ 2779597314U, // <4,3,3,5>: Cost 3 vuzpl <4,6,3,5>, <3,4,5,6>
+ 2778950237U, // <4,3,3,6>: Cost 3 vuzpl <4,5,3,7>, <3,5,6,7>
+ 2109849601U, // <4,3,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2130386944U, // <4,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 3183624193U, // <4,3,4,2>: Cost 3 ins <4,u,4,2>, lane 1
+ 1747657049U, // <4,3,4,3>: Cost 2 vuzpr <0,4,2,3>, <0,4,2,3>
+ 2109898753U, // <4,3,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 2130477056U, // <4,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0
+ 2109915137U, // <4,3,4,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 2109923329U, // <4,3,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 2130477056U, // <4,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0
+ 1839646870U, // <4,3,5,0>: Cost 2 vzipl RHS, <3,0,1,2>
+ 2109947905U, // <4,3,5,1>: Cost 2 ins <4,u,5,1>, lane 1
+ 2967061238U, // <4,3,5,2>: Cost 3 vzipr <2,3,4,5>, <1,0,3,2>
+ 1839647132U, // <4,3,5,3>: Cost 2 vzipl RHS, <3,3,3,3>
+ 1839647234U, // <4,3,5,4>: Cost 2 vzipl RHS, <3,4,5,6>
+ 2109980673U, // <4,3,5,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 2913389176U, // <4,3,5,6>: Cost 3 vzipl RHS, <3,6,0,7>
+ 2130567168U, // <4,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0
+ 1839647518U, // <4,3,5,u>: Cost 2 vzipl RHS, <3,u,1,2>
+ 2110013441U, // <4,3,6,0>: Cost 2 ins <4,u,6,0>, lane 1
+ 1973864598U, // <4,3,6,1>: Cost 2 vtrnl RHS, <3,0,1,2>
+ 2110029825U, // <4,3,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 1973864860U, // <4,3,6,3>: Cost 2 vtrnl RHS, <3,3,3,3>
+ 2110046209U, // <4,3,6,4>: Cost 2 ins <4,u,6,4>, lane 1
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 2110062593U, // <4,3,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <4,3,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,3,6,u>: Cost 1 ins RHS, lane 1
+ 3204399104U, // <4,3,7,0>: Cost 3 ins <u,3,7,0>, lane 0
+ 3204407296U, // <4,3,7,1>: Cost 3 ins <u,3,7,1>, lane 0
+ 2660701368U, // <4,3,7,2>: Cost 3 vext2 <7,2,4,3>, <7,2,4,3>
+ 3204423680U, // <4,3,7,3>: Cost 3 ins <u,3,7,3>, lane 0
+ 2968404890U, // <4,3,7,4>: Cost 3 vzipr <2,5,4,7>, <1,2,3,4>
+ 3204440064U, // <4,3,7,5>: Cost 3 ins <u,3,7,5>, lane 0
+ 2235664908U, // <4,3,7,6>: Cost 3 vrev <3,4,6,7>
+ 2110144513U, // <4,3,7,7>: Cost 2 ins <4,u,7,7>, lane 1
+ 2110144513U, // <4,3,7,u>: Cost 2 ins <4,u,7,7>, lane 1
+ 1841637526U, // <4,3,u,0>: Cost 2 vzipl RHS, <3,0,1,2>
+ 1974012054U, // <4,3,u,1>: Cost 2 vtrnl RHS, <3,0,1,2>
+ 2109587457U, // <4,3,u,2>: Cost 2 ins <4,u,0,2>, lane 1
+ 1974012316U, // <4,3,u,3>: Cost 2 vtrnl RHS, <3,3,3,3>
+ 1841637890U, // <4,3,u,4>: Cost 2 vzipl RHS, <3,4,5,6>
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2109915137U, // <4,3,u,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 1036328961U, // <4,3,u,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,3,u,u>: Cost 1 ins RHS, lane 1
+ 1974046028U, // <4,4,0,0>: Cost 2 vtrnl <4,6,0,2>, <4,6,0,2>
+ 2107572229U, // <4,4,0,1>: Cost 2 ins <4,4,u,u>, lane 5
+ 1705934950U, // <4,4,0,2>: Cost 2 vuzpl <4,6,4,6>, LHS
+ 3180724227U, // <4,4,0,3>: Cost 3 ins <4,4,0,u>, lane 3
+ 2107539458U, // <4,4,0,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 2107547650U, // <4,4,0,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 1974046006U, // <4,4,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS
+ 2109628417U, // <4,4,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 1974046024U, // <4,4,0,u>: Cost 2 vtrnl <4,6,0,2>, RHS
+ 3204620288U, // <4,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0
+ 1836665802U, // <4,4,1,1>: Cost 2 vzipl <4,1,2,3>, <4,1,2,3>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 1771700326U, // <4,4,1,3>: Cost 2 vuzpr <4,4,4,4>, LHS
+ 2107539458U, // <4,4,1,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 2130919424U, // <4,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0
+ 2107555842U, // <4,4,1,6>: Cost 2 ins <4,4,u,6>, lane 2
+ 2109702145U, // <4,4,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 2130919424U, // <4,4,1,u>: Cost 2 ins <u,4,1,5>, lane 0
+ 2779678374U, // <4,4,2,0>: Cost 3 vuzpl <4,6,4,6>, <2,3,0,1>
+ 3044625673U, // <4,4,2,1>: Cost 3 vtrnl <4,1,2,3>, <4,5,1,7>
+ 1970883530U, // <4,4,2,2>: Cost 2 vtrnl <4,1,2,3>, <4,1,2,3>
+ 2107572229U, // <4,4,2,3>: Cost 2 ins <4,4,u,u>, lane 5
+ 2107539458U, // <4,4,2,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 2107547650U, // <4,4,2,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 2131001344U, // <4,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 2109775873U, // <4,4,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 2107572229U, // <4,4,2,u>: Cost 2 ins <4,4,u,u>, lane 5
+ 3181248514U, // <4,4,3,0>: Cost 3 ins <4,4,u,0>, lane 2
+ 2779678870U, // <4,4,3,1>: Cost 3 vuzpl <4,6,4,6>, <3,0,1,2>
+ 3181264898U, // <4,4,3,2>: Cost 3 ins <4,4,u,2>, lane 2
+ 1880031352U, // <4,4,3,3>: Cost 2 vzipr <0,1,4,3>, <0,1,4,3>
+ 2107539458U, // <4,4,3,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 2107547650U, // <4,4,3,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 2107555842U, // <4,4,3,6>: Cost 2 ins <4,4,u,6>, lane 2
+ 2109849601U, // <4,4,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2107547650U, // <4,4,3,u>: Cost 2 ins <4,4,u,5>, lane 2
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2107277315U, // <4,4,4,1>: Cost 2 ins <4,4,4,u>, lane 3
+ 2107277315U, // <4,4,4,2>: Cost 2 ins <4,4,4,u>, lane 3
+ 2107277315U, // <4,4,4,3>: Cost 2 ins <4,4,4,u>, lane 3
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 2107547650U, // <4,4,4,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 1705938230U, // <4,4,4,6>: Cost 2 vuzpl <4,6,4,6>, RHS
+ 2109923329U, // <4,4,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 1839647634U, // <4,4,5,0>: Cost 2 vzipl RHS, <4,0,5,1>
+ 2109947905U, // <4,4,5,1>: Cost 2 ins <4,u,5,1>, lane 1
+ 2107351043U, // <4,4,5,2>: Cost 2 ins <4,4,5,u>, lane 3
+ 2107351043U, // <4,4,5,3>: Cost 2 ins <4,4,5,u>, lane 3
+ 1839647952U, // <4,4,5,4>: Cost 2 vzipl RHS, <4,4,4,4>
+ 765906230U, // <4,4,5,5>: Cost 1 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2107351043U, // <4,4,5,7>: Cost 2 ins <4,4,5,u>, lane 3
+ 765906473U, // <4,4,5,u>: Cost 1 vzipl RHS, RHS
+ 1973865804U, // <4,4,6,0>: Cost 2 vtrnl RHS, <4,6,0,2>
+ 2107424771U, // <4,4,6,1>: Cost 2 ins <4,4,6,u>, lane 3
+ 2110029825U, // <4,4,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2107424771U, // <4,4,6,3>: Cost 2 ins <4,4,6,u>, lane 3
+ 1973865680U, // <4,4,6,4>: Cost 2 vtrnl RHS, <4,4,4,4>
+ 1973865362U, // <4,4,6,5>: Cost 2 vtrnl RHS, <4,0,5,1>
+ 900123958U, // <4,4,6,6>: Cost 1 vtrnl RHS, RHS
+ 1036328961U, // <4,4,6,7>: Cost 1 ins RHS, lane 1
+ 900123976U, // <4,4,6,u>: Cost 1 vtrnl RHS, RHS
+ 3181248514U, // <4,4,7,0>: Cost 3 ins <4,4,u,0>, lane 2
+ 2779681786U, // <4,4,7,1>: Cost 3 vuzpl <4,6,4,6>, <7,0,1,2>
+ 3181264898U, // <4,4,7,2>: Cost 3 ins <4,4,u,2>, lane 2
+ 2845442636U, // <4,4,7,3>: Cost 3 vuzpr <4,4,4,4>, <0,7,2,3>
+ 2107539458U, // <4,4,7,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 2107547650U, // <4,4,7,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 2131369984U, // <4,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 2040311013U, // <4,4,7,7>: Cost 2 vtrnr <4,4,6,7>, <4,4,6,7>
+ 2107547650U, // <4,4,7,u>: Cost 2 ins <4,4,u,5>, lane 2
+ 1974013260U, // <4,4,u,0>: Cost 2 vtrnl RHS, <4,6,0,2>
+ 2107572229U, // <4,4,u,1>: Cost 2 ins <4,4,u,u>, lane 5
+ 1705940782U, // <4,4,u,2>: Cost 2 vuzpl <4,6,4,6>, LHS
+ 2107572229U, // <4,4,u,3>: Cost 2 ins <4,4,u,u>, lane 5
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 767896886U, // <4,4,u,5>: Cost 1 vzipl RHS, RHS
+ 900271414U, // <4,4,u,6>: Cost 1 vtrnl RHS, RHS
+ 1036328961U, // <4,4,u,7>: Cost 1 ins RHS, lane 1
+ 900271432U, // <4,4,u,u>: Cost 1 vtrnl RHS, RHS
+ 2108170242U, // <4,5,0,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 1034493957U, // <4,5,0,1>: Cost 1 ins RHS, lane 5
+ 1707294822U, // <4,5,0,2>: Cost 2 vuzpl <4,u,5,1>, LHS
+ 2108194818U, // <4,5,0,3>: Cost 2 ins <4,5,u,3>, lane 2
+ 2108203010U, // <4,5,0,4>: Cost 2 ins <4,5,u,4>, lane 2
+ 2108211202U, // <4,5,0,5>: Cost 2 ins <4,5,u,5>, lane 2
+ 2108219394U, // <4,5,0,6>: Cost 2 ins <4,5,u,6>, lane 2
+ 1034485762U, // <4,5,0,7>: Cost 1 ins RHS, lane 2
+ 1034493957U, // <4,5,0,u>: Cost 1 ins RHS, lane 5
+ 2108170242U, // <4,5,1,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 2133540868U, // <4,5,1,1>: Cost 2 ins <u,u,1,1>, lane 4
+ 2133549060U, // <4,5,1,2>: Cost 2 ins <u,u,1,2>, lane 4
+ 1747599462U, // <4,5,1,3>: Cost 2 vuzpr <0,4,1,5>, LHS
+ 2108203010U, // <4,5,1,4>: Cost 2 ins <4,5,u,4>, lane 2
+ 2133573636U, // <4,5,1,5>: Cost 2 ins <u,u,1,5>, lane 4
+ 2108219394U, // <4,5,1,6>: Cost 2 ins <4,5,u,6>, lane 2
+ 1034485762U, // <4,5,1,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,5,1,u>: Cost 1 ins RHS, lane 2
+ 2108170242U, // <4,5,2,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 2108178434U, // <4,5,2,1>: Cost 2 ins <4,5,u,1>, lane 2
+ 2133622788U, // <4,5,2,2>: Cost 2 ins <u,u,2,2>, lane 4
+ 1059889156U, // <4,5,2,3>: Cost 1 ins LHS, lane 4
+ 2108203010U, // <4,5,2,4>: Cost 2 ins <4,5,u,4>, lane 2
+ 2108211202U, // <4,5,2,5>: Cost 2 ins <4,5,u,5>, lane 2
+ 2133655556U, // <4,5,2,6>: Cost 2 ins <u,u,2,6>, lane 4
+ 1034485762U, // <4,5,2,7>: Cost 1 ins RHS, lane 2
+ 1059889156U, // <4,5,2,u>: Cost 1 ins LHS, lane 4
+ 2133680132U, // <4,5,3,0>: Cost 2 ins <u,u,3,0>, lane 4
+ 2108178434U, // <4,5,3,1>: Cost 2 ins <4,5,u,1>, lane 2
+ 2133696516U, // <4,5,3,2>: Cost 2 ins <u,u,3,2>, lane 4
+ 2133704708U, // <4,5,3,3>: Cost 2 ins <u,u,3,3>, lane 4
+ 2133712900U, // <4,5,3,4>: Cost 2 ins <u,u,3,4>, lane 4
+ 2108211202U, // <4,5,3,5>: Cost 2 ins <4,5,u,5>, lane 2
+ 2108219394U, // <4,5,3,6>: Cost 2 ins <4,5,u,6>, lane 2
+ 1034485762U, // <4,5,3,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,5,3,u>: Cost 1 ins RHS, lane 2
+ 2108170242U, // <4,5,4,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 2108178434U, // <4,5,4,1>: Cost 2 ins <4,5,u,1>, lane 2
+ 2108186626U, // <4,5,4,2>: Cost 2 ins <4,5,u,2>, lane 2
+ 2108194818U, // <4,5,4,3>: Cost 2 ins <4,5,u,3>, lane 2
+ 2109898753U, // <4,5,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 1034493957U, // <4,5,4,5>: Cost 1 ins RHS, lane 5
+ 1707298102U, // <4,5,4,6>: Cost 2 vuzpl <4,u,5,1>, RHS
+ 1034485762U, // <4,5,4,7>: Cost 1 ins RHS, lane 2
+ 1034493957U, // <4,5,4,u>: Cost 1 ins RHS, lane 5
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1839656656U, // <4,5,5,1>: Cost 2 vzipl RHS, <5,1,7,3>
+ 2108186626U, // <4,5,5,2>: Cost 2 ins <4,5,u,2>, lane 2
+ 2108194818U, // <4,5,5,3>: Cost 2 ins <4,5,u,3>, lane 2
+ 1839648710U, // <4,5,5,4>: Cost 2 vzipl RHS, <5,4,7,6>
+ 1839648772U, // <4,5,5,5>: Cost 2 vzipl RHS, <5,5,5,5>
+ 1839648866U, // <4,5,5,6>: Cost 2 vzipl RHS, <5,6,7,0>
+ 1034485762U, // <4,5,5,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,5,5,u>: Cost 1 ins RHS, lane 2
+ 1034346499U, // <4,5,6,0>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,1>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,2>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,3>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,4>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,5>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,5,6,6>: Cost 1 ins RHS, lane 3
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2133975044U, // <4,5,7,0>: Cost 2 ins <u,u,7,0>, lane 4
+ 2108178434U, // <4,5,7,1>: Cost 2 ins <4,5,u,1>, lane 2
+ 2108186626U, // <4,5,7,2>: Cost 2 ins <4,5,u,2>, lane 2
+ 2133999620U, // <4,5,7,3>: Cost 2 ins <u,u,7,3>, lane 4
+ 2134007812U, // <4,5,7,4>: Cost 2 ins <u,u,7,4>, lane 4
+ 2108211202U, // <4,5,7,5>: Cost 2 ins <4,5,u,5>, lane 2
+ 2134024196U, // <4,5,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 1034485762U, // <4,5,7,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,5,7,u>: Cost 1 ins RHS, lane 2
+ 1034346499U, // <4,5,u,0>: Cost 1 ins RHS, lane 3
+ 1034493957U, // <4,5,u,1>: Cost 1 ins RHS, lane 5
+ 1034346499U, // <4,5,u,2>: Cost 1 ins RHS, lane 3
+ 1059889156U, // <4,5,u,3>: Cost 1 ins LHS, lane 4
+ 1034346499U, // <4,5,u,4>: Cost 1 ins RHS, lane 3
+ 1034493957U, // <4,5,u,5>: Cost 1 ins RHS, lane 5
+ 1034346499U, // <4,5,u,6>: Cost 1 ins RHS, lane 3
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 1705426944U, // <4,6,0,0>: Cost 2 vuzpl RHS, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 631685222U, // <4,6,0,2>: Cost 1 vuzpl RHS, LHS
+ 2108309507U, // <4,6,0,3>: Cost 2 ins <4,6,0,u>, lane 3
+ 1705427148U, // <4,6,0,4>: Cost 2 vuzpl RHS, <0,2,4,6>
+ 2108309507U, // <4,6,0,5>: Cost 2 ins <4,6,0,u>, lane 3
+ 2108882946U, // <4,6,0,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 2108309507U, // <4,6,0,7>: Cost 2 ins <4,6,0,u>, lane 3
+ 631685276U, // <4,6,0,u>: Cost 1 vuzpl RHS, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 1705427764U, // <4,6,1,1>: Cost 2 vuzpl RHS, <1,1,1,1>
+ 2108850178U, // <4,6,1,2>: Cost 2 ins <4,6,u,2>, lane 2
+ 1747681382U, // <4,6,1,3>: Cost 2 vuzpr <0,4,2,6>, LHS
+ 2779169619U, // <4,6,1,4>: Cost 3 vuzpl RHS, <1,1,4,5>
+ 1705427968U, // <4,6,1,5>: Cost 2 vuzpl RHS, <1,3,5,7>
+ 2108882946U, // <4,6,1,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 2109702145U, // <4,6,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 1747681387U, // <4,6,1,u>: Cost 2 vuzpr <0,4,2,6>, LHS
+ 1705428646U, // <4,6,2,0>: Cost 2 vuzpl RHS, <2,3,0,1>
+ 2779170237U, // <4,6,2,1>: Cost 3 vuzpl RHS, <2,0,1,2>
+ 1705428584U, // <4,6,2,2>: Cost 2 vuzpl RHS, <2,2,2,2>
+ 1705428594U, // <4,6,2,3>: Cost 2 vuzpl RHS, <2,2,3,3>
+ 1705428686U, // <4,6,2,4>: Cost 2 vuzpl RHS, <2,3,4,5>
+ 2839560386U, // <4,6,2,5>: Cost 3 vuzpr <3,4,5,6>, <0,2,3,5>
+ 2108882946U, // <4,6,2,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 2109775873U, // <4,6,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 1705428639U, // <4,6,2,u>: Cost 2 vuzpl RHS, <2,2,u,3>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 1705429142U, // <4,6,3,1>: Cost 2 vuzpl RHS, <3,0,1,2>
+ 2108850178U, // <4,6,3,2>: Cost 2 ins <4,6,u,2>, lane 2
+ 1705429404U, // <4,6,3,3>: Cost 2 vuzpl RHS, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 1705429506U, // <4,6,3,5>: Cost 2 vuzpl RHS, <3,4,5,6>
+ 2108882946U, // <4,6,3,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 2132410368U, // <4,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0
+ 1705429205U, // <4,6,3,u>: Cost 2 vuzpl RHS, <3,0,u,2>
+ 1705430348U, // <4,6,4,0>: Cost 2 vuzpl RHS, <4,6,0,2>
+ 2108604419U, // <4,6,4,1>: Cost 2 ins <4,6,4,u>, lane 3
+ 2108850178U, // <4,6,4,2>: Cost 2 ins <4,6,u,2>, lane 2
+ 2108604419U, // <4,6,4,3>: Cost 2 ins <4,6,4,u>, lane 3
+ 1705430224U, // <4,6,4,4>: Cost 2 vuzpl RHS, <4,4,4,4>
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 631688502U, // <4,6,4,6>: Cost 1 vuzpl RHS, RHS
+ 2108604419U, // <4,6,4,7>: Cost 2 ins <4,6,4,u>, lane 3
+ 631688520U, // <4,6,4,u>: Cost 1 vuzpl RHS, RHS
+ 2839563567U, // <4,6,5,0>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,0>
+ 1705439360U, // <4,6,5,1>: Cost 2 vuzpl RHS, <5,7,1,3>
+ 1839657466U, // <4,6,5,2>: Cost 2 vzipl RHS, <6,2,7,3>
+ 2839563570U, // <4,6,5,3>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2839563571U, // <4,6,5,4>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,4>
+ 1705431044U, // <4,6,5,5>: Cost 2 vuzpl RHS, <5,5,5,5>
+ 1839649592U, // <4,6,5,6>: Cost 2 vzipl RHS, <6,6,6,6>
+ 1747684662U, // <4,6,5,7>: Cost 2 vuzpr <0,4,2,6>, RHS
+ 1747684663U, // <4,6,5,u>: Cost 2 vuzpr <0,4,2,6>, RHS
+ 1705431886U, // <4,6,6,0>: Cost 2 vuzpl RHS, <6,7,0,1>
+ 2110021633U, // <4,6,6,1>: Cost 2 ins <4,u,6,1>, lane 1
+ 2110029825U, // <4,6,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2110038017U, // <4,6,6,3>: Cost 2 ins <4,u,6,3>, lane 1
+ 1705431926U, // <4,6,6,4>: Cost 2 vuzpl RHS, <6,7,4,5>
+ 2110054401U, // <4,6,6,5>: Cost 2 ins <4,u,6,5>, lane 1
+ 1705431864U, // <4,6,6,6>: Cost 2 vuzpl RHS, <6,6,6,6>
+ 1036328961U, // <4,6,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,6,6,u>: Cost 1 ins RHS, lane 1
+ 2132647936U, // <4,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0
+ 1705432058U, // <4,6,7,1>: Cost 2 vuzpl RHS, <7,0,1,2>
+ 2108850178U, // <4,6,7,2>: Cost 2 ins <4,6,u,2>, lane 2
+ 2779173980U, // <4,6,7,3>: Cost 3 vuzpl RHS, <7,1,3,1>
+ 2132680704U, // <4,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 1705432422U, // <4,6,7,5>: Cost 2 vuzpl RHS, <7,4,5,6>
+ 2108882946U, // <4,6,7,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 1705432684U, // <4,6,7,7>: Cost 2 vuzpl RHS, <7,7,7,7>
+ 1705432121U, // <4,6,7,u>: Cost 2 vuzpl RHS, <7,0,u,2>
+ 1705433020U, // <4,6,u,0>: Cost 2 vuzpl RHS, <u,3,0,1>
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 631691054U, // <4,6,u,2>: Cost 1 vuzpl RHS, LHS
+ 1747681949U, // <4,6,u,3>: Cost 2 vuzpr <0,4,2,6>, LHS
+ 1705433060U, // <4,6,u,4>: Cost 2 vuzpl RHS, <u,3,4,5>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 631691418U, // <4,6,u,6>: Cost 1 vuzpl RHS, RHS
+ 1036328961U, // <4,6,u,7>: Cost 1 ins RHS, lane 1
+ 631691108U, // <4,6,u,u>: Cost 1 vuzpl RHS, LHS
+ 3206537216U, // <4,7,0,0>: Cost 3 ins <u,7,0,0>, lane 0
+ 2132803584U, // <4,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0
+ 2109587457U, // <4,7,0,2>: Cost 2 ins <4,u,0,2>, lane 1
+ 2845614101U, // <4,7,0,3>: Cost 3 vuzpr <4,4,6,7>, <0,0,2,3>
+ 3206569984U, // <4,7,0,4>: Cost 3 ins <u,7,0,4>, lane 0
+ 3047789926U, // <4,7,0,5>: Cost 3 vtrnl <4,6,0,2>, <7,4,5,6>
+ 3047789929U, // <4,7,0,6>: Cost 3 vtrnl <4,6,0,2>, <7,4,6,0>
+ 2109628417U, // <4,7,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 2132803584U, // <4,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3206619136U, // <4,7,1,1>: Cost 3 ins <u,7,1,1>, lane 0
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 2132893696U, // <4,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0
+ 3206643712U, // <4,7,1,4>: Cost 3 ins <u,7,1,4>, lane 0
+ 3206651904U, // <4,7,1,5>: Cost 3 ins <u,7,1,5>, lane 0
+ 2988265414U, // <4,7,1,6>: Cost 3 vzipr <5,u,4,1>, <5,4,7,6>
+ 2109702145U, // <4,7,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 2132893696U, // <4,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0
+ 3206684672U, // <4,7,2,0>: Cost 3 ins <u,7,2,0>, lane 0
+ 3206692864U, // <4,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0
+ 3206701056U, // <4,7,2,2>: Cost 3 ins <u,7,2,2>, lane 0
+ 2132967424U, // <4,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0
+ 2833597338U, // <4,7,2,4>: Cost 3 vuzpr <2,4,5,7>, <1,2,3,4>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3206733824U, // <4,7,2,6>: Cost 3 ins <u,7,2,6>, lane 0
+ 2109775873U, // <4,7,2,7>: Cost 2 ins <4,u,2,7>, lane 1
+ 2132967424U, // <4,7,2,u>: Cost 2 ins <u,7,2,3>, lane 0
+ 3206758400U, // <4,7,3,0>: Cost 3 ins <u,7,3,0>, lane 0
+ 3206766592U, // <4,7,3,1>: Cost 3 ins <u,7,3,1>, lane 0
+ 3047388245U, // <4,7,3,2>: Cost 3 vtrnl <4,5,3,7>, <7,1,2,3>
+ 3206782976U, // <4,7,3,3>: Cost 3 ins <u,7,3,3>, lane 0
+ 2989609062U, // <4,7,3,4>: Cost 3 vzipr <6,1,4,3>, <5,6,7,4>
+ 3206799360U, // <4,7,3,5>: Cost 3 ins <u,7,3,5>, lane 0
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 2109849601U, // <4,7,3,7>: Cost 2 ins <4,u,3,7>, lane 1
+ 2109849601U, // <4,7,3,u>: Cost 2 ins <4,u,3,7>, lane 1
+ 2583199846U, // <4,7,4,0>: Cost 3 vext1 <5,4,7,4>, LHS
+ 3048117242U, // <4,7,4,1>: Cost 3 vtrnl <4,6,4,6>, <7,0,1,2>
+ 3183624193U, // <4,7,4,2>: Cost 3 ins <4,u,4,2>, lane 1
+ 2979659923U, // <4,7,4,3>: Cost 3 vzipr <4,4,4,4>, <0,1,7,3>
+ 2109898753U, // <4,7,4,4>: Cost 2 ins <4,u,4,4>, lane 1
+ 2133131264U, // <4,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0
+ 2109915137U, // <4,7,4,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 1771875557U, // <4,7,4,7>: Cost 2 vuzpr <4,4,6,7>, <4,4,6,7>
+ 2133131264U, // <4,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0
+ 1839649786U, // <4,7,5,0>: Cost 2 vzipl RHS, <7,0,1,2>
+ 2109947905U, // <4,7,5,1>: Cost 2 ins <4,u,5,1>, lane 1
+ 2913391781U, // <4,7,5,2>: Cost 3 vzipl RHS, <7,2,2,2>
+ 2913391843U, // <4,7,5,3>: Cost 3 vzipl RHS, <7,3,0,1>
+ 1839650150U, // <4,7,5,4>: Cost 2 vzipl RHS, <7,4,5,6>
+ 2109980673U, // <4,7,5,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 2913392145U, // <4,7,5,6>: Cost 3 vzipl RHS, <7,6,6,6>
+ 1839650412U, // <4,7,5,7>: Cost 2 vzipl RHS, <7,7,7,7>
+ 1839650434U, // <4,7,5,u>: Cost 2 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 1973867514U, // <4,7,6,1>: Cost 2 vtrnl RHS, <7,0,1,2>
+ 2110029825U, // <4,7,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2110038017U, // <4,7,6,3>: Cost 2 ins <4,u,6,3>, lane 1
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1973867878U, // <4,7,6,5>: Cost 2 vtrnl RHS, <7,4,5,6>
+ 2110062593U, // <4,7,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <4,7,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,7,6,u>: Cost 1 ins RHS, lane 1
+ 2914587642U, // <4,7,7,0>: Cost 3 vzipl <4,7,5,0>, <7,0,1,2>
+ 2779862010U, // <4,7,7,1>: Cost 3 vuzpl <4,6,7,1>, <7,0,1,2>
+ 2779247701U, // <4,7,7,2>: Cost 3 vuzpl <4,5,7,7>, <7,1,2,3>
+ 3207077888U, // <4,7,7,3>: Cost 3 ins <u,7,7,3>, lane 0
+ 2914620774U, // <4,7,7,4>: Cost 3 vzipl <4,7,5,4>, <7,4,5,6>
+ 2779895142U, // <4,7,7,5>: Cost 3 vuzpl <4,6,7,5>, <7,4,5,6>
+ 2992295878U, // <4,7,7,6>: Cost 3 vzipr <6,5,4,7>, <5,4,7,6>
+ 2133368832U, // <4,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0
+ 2133368832U, // <4,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0
+ 1841640442U, // <4,7,u,0>: Cost 2 vzipl RHS, <7,0,1,2>
+ 1974014970U, // <4,7,u,1>: Cost 2 vtrnl RHS, <7,0,1,2>
+ 2109587457U, // <4,7,u,2>: Cost 2 ins <4,u,0,2>, lane 1
+ 2132893696U, // <4,7,u,3>: Cost 2 ins <u,7,1,3>, lane 0
+ 1841640806U, // <4,7,u,4>: Cost 2 vzipl RHS, <7,4,5,6>
+ 1974015334U, // <4,7,u,5>: Cost 2 vtrnl RHS, <7,4,5,6>
+ 2109915137U, // <4,7,u,6>: Cost 2 ins <4,u,4,6>, lane 1
+ 1036328961U, // <4,7,u,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <4,7,u,u>: Cost 1 ins RHS, lane 1
+ 1705574400U, // <4,u,0,0>: Cost 2 vuzpl RHS, <0,0,0,0>
+ 1034493957U, // <4,u,0,1>: Cost 1 ins RHS, lane 5
+ 631832678U, // <4,u,0,2>: Cost 1 vuzpl RHS, LHS
+ 2108309507U, // <4,u,0,3>: Cost 2 ins <4,6,0,u>, lane 3
+ 1705574604U, // <4,u,0,4>: Cost 2 vuzpl RHS, <0,2,4,6>
+ 2107547650U, // <4,u,0,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 1974048922U, // <4,u,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS
+ 1034485762U, // <4,u,0,7>: Cost 1 ins RHS, lane 2
+ 631832732U, // <4,u,0,u>: Cost 1 vuzpl RHS, LHS
+ 2108170242U, // <4,u,1,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 1705575220U, // <4,u,1,1>: Cost 2 vuzpl RHS, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1747624038U, // <4,u,1,3>: Cost 2 vuzpr <0,4,1,u>, LHS
+ 2107539458U, // <4,u,1,4>: Cost 2 ins <4,4,u,4>, lane 2
+ 1705575424U, // <4,u,1,5>: Cost 2 vuzpl RHS, <1,3,5,7>
+ 2107555842U, // <4,u,1,6>: Cost 2 ins <4,4,u,6>, lane 2
+ 1034485762U, // <4,u,1,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,u,1,u>: Cost 1 ins RHS, lane 2
+ 1705576102U, // <4,u,2,0>: Cost 2 vuzpl RHS, <2,3,0,1>
+ 2104860674U, // <4,u,2,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 1705576040U, // <4,u,2,2>: Cost 2 vuzpl RHS, <2,2,2,2>
+ 1055244288U, // <4,u,2,3>: Cost 1 ins LHS, lane 0
+ 1705576142U, // <4,u,2,4>: Cost 2 vuzpl RHS, <2,3,4,5>
+ 2107547650U, // <4,u,2,5>: Cost 2 ins <4,4,u,5>, lane 2
+ 2131001344U, // <4,u,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 1034485762U, // <4,u,2,7>: Cost 1 ins RHS, lane 2
+ 1055244288U, // <4,u,2,u>: Cost 1 ins LHS, lane 0
+ 2129698816U, // <4,u,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 1705576598U, // <4,u,3,1>: Cost 2 vuzpl RHS, <3,0,1,2>
+ 2128388096U, // <4,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 1705576860U, // <4,u,3,3>: Cost 2 vuzpl RHS, <3,3,3,3>
+ 2129731584U, // <4,u,3,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 1705576962U, // <4,u,3,5>: Cost 2 vuzpl RHS, <3,4,5,6>
+ 2107555842U, // <4,u,3,6>: Cost 2 ins <4,4,u,6>, lane 2
+ 1034485762U, // <4,u,3,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,u,3,u>: Cost 1 ins RHS, lane 2
+ 1705577804U, // <4,u,4,0>: Cost 2 vuzpl RHS, <4,6,0,2>
+ 2104860674U, // <4,u,4,1>: Cost 2 ins <4,0,u,1>, lane 2
+ 1974376238U, // <4,u,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS
+ 2108604419U, // <4,u,4,3>: Cost 2 ins <4,6,4,u>, lane 3
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1034493957U, // <4,u,4,5>: Cost 1 ins RHS, lane 5
+ 631835958U, // <4,u,4,6>: Cost 1 vuzpl RHS, RHS
+ 1034485762U, // <4,u,4,7>: Cost 1 ins RHS, lane 2
+ 631835976U, // <4,u,4,u>: Cost 1 vuzpl RHS, RHS
+ 1839650515U, // <4,u,5,0>: Cost 2 vzipl RHS, <u,0,1,2>
+ 765908782U, // <4,u,5,1>: Cost 1 vzipl RHS, LHS
+ 1839650693U, // <4,u,5,2>: Cost 2 vzipl RHS, <u,2,3,0>
+ 2016035485U, // <4,u,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS
+ 1839650879U, // <4,u,5,4>: Cost 2 vzipl RHS, <u,4,5,6>
+ 765909146U, // <4,u,5,5>: Cost 1 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1034485762U, // <4,u,5,7>: Cost 1 ins RHS, lane 2
+ 765909349U, // <4,u,5,u>: Cost 1 vzipl RHS, LHS
+ 1034346499U, // <4,u,6,0>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,u,6,1>: Cost 1 ins RHS, lane 3
+ 900126510U, // <4,u,6,2>: Cost 1 vtrnl RHS, LHS
+ 1034346499U, // <4,u,6,3>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,u,6,4>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <4,u,6,5>: Cost 1 ins RHS, lane 3
+ 900126874U, // <4,u,6,6>: Cost 1 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2133975044U, // <4,u,7,0>: Cost 2 ins <u,u,7,0>, lane 4
+ 1705579514U, // <4,u,7,1>: Cost 2 vuzpl RHS, <7,0,1,2>
+ 2104868866U, // <4,u,7,2>: Cost 2 ins <4,0,u,2>, lane 2
+ 2129354752U, // <4,u,7,3>: Cost 2 ins <u,1,7,3>, lane 0
+ 2134007812U, // <4,u,7,4>: Cost 2 ins <u,u,7,4>, lane 4
+ 1705579878U, // <4,u,7,5>: Cost 2 vuzpl RHS, <7,4,5,6>
+ 2131369984U, // <4,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0
+ 1034485762U, // <4,u,7,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <4,u,7,u>: Cost 1 ins RHS, lane 2
+ 1034346499U, // <4,u,u,0>: Cost 1 ins RHS, lane 3
+ 767899438U, // <4,u,u,1>: Cost 1 vzipl RHS, LHS
+ 631838510U, // <4,u,u,2>: Cost 1 vuzpl RHS, LHS
+ 1055244288U, // <4,u,u,3>: Cost 1 ins LHS, lane 0
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 767899802U, // <4,u,u,5>: Cost 1 vzipl RHS, RHS
+ 631838874U, // <4,u,u,6>: Cost 1 vuzpl RHS, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2128150528U, // <5,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 2846220309U, // <5,0,0,3>: Cost 3 vuzpr <4,5,6,0>, <0,0,2,3>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 2583318482U, // <5,0,0,5>: Cost 3 vext1 <5,5,0,0>, <5,5,0,0>
+ 3189334017U, // <5,0,0,6>: Cost 3 ins <5,u,0,6>, lane 1
+ 2846223265U, // <5,0,0,7>: Cost 3 vuzpr <4,5,6,0>, <4,0,6,7>
+ 2128150528U, // <5,0,0,u>: Cost 2 ins <u,0,0,0>, lane 0
+ 1503608934U, // <5,0,1,0>: Cost 2 vext1 <4,5,0,1>, LHS
+ 1843003494U, // <5,0,1,1>: Cost 2 vzipl <5,1,7,3>, LHS
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2115641345U, // <5,0,1,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 1611612282U, // <5,0,1,4>: Cost 2 vext3 <0,1,4,5>, <0,1,4,5>
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3202015232U, // <5,0,1,6>: Cost 3 ins <u,0,1,6>, lane 0
+ 3189415937U, // <5,0,1,7>: Cost 3 ins <5,u,1,7>, lane 1
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2128314368U, // <5,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2128322560U, // <5,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 3189481473U, // <5,0,2,6>: Cost 3 ins <5,u,2,6>, lane 1
+ 2595280262U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,5,0,2>
+ 2128314368U, // <5,0,2,u>: Cost 2 ins <u,0,2,2>, lane 0
+ 3202113536U, // <5,0,3,0>: Cost 3 ins <u,0,3,0>, lane 0
+ 2918047846U, // <5,0,3,1>: Cost 3 vzipl <5,3,7,0>, LHS
+ 2128388096U, // <5,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 3189530625U, // <5,0,3,3>: Cost 3 ins <5,u,3,3>, lane 1
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 2785315330U, // <5,0,3,5>: Cost 3 vuzpl <5,6,0,1>, <3,4,5,6>
+ 3202162688U, // <5,0,3,6>: Cost 3 ins <u,0,3,6>, lane 0
+ 2840323072U, // <5,0,3,7>: Cost 3 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2128388096U, // <5,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3184336899U, // <5,0,4,3>: Cost 3 ins <5,0,4,u>, lane 3
+ 2687345005U, // <5,0,4,4>: Cost 3 vext3 <0,4,4,5>, <0,4,4,5>
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2846222850U, // <5,0,4,6>: Cost 3 vuzpr <4,5,6,0>, <3,4,5,6>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1845019293U, // <5,0,4,u>: Cost 2 vzipl <5,4,7,6>, LHS
+ 1772481839U, // <5,0,5,0>: Cost 2 vuzpr <4,5,6,0>, <4,5,6,0>
+ 1845526630U, // <5,0,5,1>: Cost 2 vzipl <5,5,5,5>, LHS
+ 1979744358U, // <5,0,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS
+ 3189678081U, // <5,0,5,3>: Cost 3 ins <5,u,5,3>, lane 1
+ 2919268690U, // <5,0,5,4>: Cost 3 vzipl <5,5,5,5>, <0,4,1,5>
+ 2115952641U, // <5,0,5,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 3202310144U, // <5,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0
+ 2115969025U, // <5,0,5,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 1845527197U, // <5,0,5,u>: Cost 2 vzipl <5,5,5,5>, LHS
+ 2973777920U, // <5,0,6,0>: Cost 3 vzipr <3,4,5,6>, <0,0,0,0>
+ 1846296678U, // <5,0,6,1>: Cost 2 vzipl <5,6,7,0>, LHS
+ 2128609280U, // <5,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 3189751809U, // <5,0,6,3>: Cost 3 ins <5,u,6,3>, lane 1
+ 2920038738U, // <5,0,6,4>: Cost 3 vzipl <5,6,7,0>, <0,4,1,5>
+ 2920038866U, // <5,0,6,5>: Cost 3 vzipl <5,6,7,0>, <0,5,6,7>
+ 3189776385U, // <5,0,6,6>: Cost 3 ins <5,u,6,6>, lane 1
+ 2128650240U, // <5,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0
+ 1846297245U, // <5,0,6,u>: Cost 2 vzipl <5,6,7,0>, LHS
+ 2040971264U, // <5,0,7,0>: Cost 2 vtrnr RHS, <0,0,0,0>
+ 2040971274U, // <5,0,7,1>: Cost 2 vtrnr RHS, <0,0,1,1>
+ 2040971284U, // <5,0,7,2>: Cost 2 vtrnr RHS, <0,0,2,2>
+ 2116083713U, // <5,0,7,3>: Cost 2 ins <5,u,7,3>, lane 1
+ 2116091905U, // <5,0,7,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 3114715316U, // <5,0,7,5>: Cost 3 vtrnr RHS, <3,0,4,5>
+ 2116108289U, // <5,0,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2116116481U, // <5,0,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 2040971281U, // <5,0,7,u>: Cost 2 vtrnr RHS, <0,0,1,u>
+ 2040979456U, // <5,0,u,0>: Cost 2 vtrnr RHS, <0,0,0,0>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2115641345U, // <5,0,u,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 2116091905U, // <5,0,u,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 2115952641U, // <5,0,u,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 2116108289U, // <5,0,u,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2115969025U, // <5,0,u,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 1712324710U, // <5,1,0,2>: Cost 2 vuzpl <5,7,1,3>, LHS
+ 2111512578U, // <5,1,0,3>: Cost 2 ins <5,1,u,3>, lane 2
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2977710418U, // <5,1,0,5>: Cost 3 vzipr <4,1,5,0>, <0,4,1,5>
+ 3185278978U, // <5,1,0,6>: Cost 3 ins <5,1,u,6>, lane 2
+ 3184705539U, // <5,1,0,7>: Cost 3 ins <5,1,0,u>, lane 3
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2128896000U, // <5,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2115641345U, // <5,1,1,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3189407745U, // <5,1,1,6>: Cost 3 ins <5,u,1,6>, lane 1
+ 2982367283U, // <5,1,1,7>: Cost 3 vzipr <4,u,5,1>, <5,6,1,7>
+ 2115641345U, // <5,1,1,u>: Cost 2 ins <5,u,1,3>, lane 1
+ 2128961536U, // <5,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <5,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <5,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <5,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <5,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <5,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <5,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <5,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <5,1,2,u>: Cost 1 ins LHS, lane 0
+ 2571468902U, // <5,1,3,0>: Cost 3 vext1 <3,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 2571470542U, // <5,1,3,2>: Cost 3 vext1 <3,5,1,3>, <2,3,4,5>
+ 2129059840U, // <5,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 2595361654U, // <5,1,3,6>: Cost 3 vext1 <7,5,1,3>, <6,7,4,5>
+ 2840331264U, // <5,1,3,7>: Cost 3 vuzpr <3,5,7,1>, <1,3,5,7>
+ 2129059840U, // <5,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2111512578U, // <5,1,4,3>: Cost 2 ins <5,1,u,3>, lane 2
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 1712327990U, // <5,1,4,6>: Cost 2 vuzpl <5,7,1,3>, RHS
+ 3185000451U, // <5,1,4,7>: Cost 3 ins <5,1,4,u>, lane 3
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 1712328832U, // <5,1,5,1>: Cost 2 vuzpl <5,7,1,3>, <5,7,1,3>
+ 2982398102U, // <5,1,5,2>: Cost 3 vzipr <4,u,5,5>, <3,0,1,2>
+ 2046853222U, // <5,1,5,3>: Cost 2 vtrnr <5,5,5,5>, LHS
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2115952641U, // <5,1,5,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2115969025U, // <5,1,5,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 2046853227U, // <5,1,5,u>: Cost 2 vtrnr <5,5,5,5>, LHS
+ 2920039158U, // <5,1,6,0>: Cost 3 vzipl <5,6,7,0>, <1,0,3,2>
+ 2961834642U, // <5,1,6,1>: Cost 3 vzipr <1,4,5,6>, <0,u,1,1>
+ 2973780118U, // <5,1,6,2>: Cost 3 vzipr <3,4,5,6>, <3,0,1,2>
+ 2111512578U, // <5,1,6,3>: Cost 2 ins <5,1,u,3>, lane 2
+ 2224227480U, // <5,1,6,4>: Cost 3 vrev <1,5,4,6>
+ 2973778258U, // <5,1,6,5>: Cost 3 vzipr <3,4,5,6>, <0,4,1,5>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2111553541U, // <5,1,6,7>: Cost 2 ins <5,1,u,u>, lane 5
+ 2111512578U, // <5,1,6,u>: Cost 2 ins <5,1,u,3>, lane 2
+ 2116059137U, // <5,1,7,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2040972084U, // <5,1,7,1>: Cost 2 vtrnr RHS, <1,1,1,1>
+ 2111479811U, // <5,1,7,2>: Cost 2 ins <5,1,7,u>, lane 3
+ 967229542U, // <5,1,7,3>: Cost 1 vtrnr RHS, LHS
+ 2116091905U, // <5,1,7,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 2111479811U, // <5,1,7,5>: Cost 2 ins <5,1,7,u>, lane 3
+ 2116108289U, // <5,1,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2116116481U, // <5,1,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 967229547U, // <5,1,7,u>: Cost 1 vtrnr RHS, LHS
+ 2116059137U, // <5,1,u,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2040980276U, // <5,1,u,1>: Cost 2 vtrnr RHS, <1,1,1,1>
+ 1712330542U, // <5,1,u,2>: Cost 2 vuzpl <5,7,1,3>, LHS
+ 967237734U, // <5,1,u,3>: Cost 1 vtrnr RHS, LHS
+ 2116091905U, // <5,1,u,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 1712330906U, // <5,1,u,6>: Cost 2 vuzpl <5,7,1,3>, RHS
+ 2115969025U, // <5,1,u,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 967237739U, // <5,1,u,u>: Cost 1 vtrnr RHS, LHS
+ 2786132132U, // <5,2,0,0>: Cost 3 vuzpl <5,7,2,2>, <0,2,0,2>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2129494016U, // <5,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 2973728870U, // <5,2,0,3>: Cost 3 vzipr <3,4,5,0>, LHS
+ 2786164940U, // <5,2,0,4>: Cost 3 vuzpl <5,7,2,6>, <0,2,4,6>
+ 2782158977U, // <5,2,0,5>: Cost 3 vuzpl <5,1,2,3>, <0,1,5,3>
+ 3185942530U, // <5,2,0,6>: Cost 3 ins <5,2,u,6>, lane 2
+ 3114658883U, // <5,2,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,2,6,7>
+ 2129494016U, // <5,2,0,u>: Cost 2 ins <u,2,0,2>, lane 0
+ 3054503590U, // <5,2,1,0>: Cost 3 vtrnl <5,7,1,3>, <2,3,0,1>
+ 3203301376U, // <5,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0
+ 2982363156U, // <5,2,1,2>: Cost 3 vzipr <4,u,5,1>, <0,0,2,2>
+ 1908621414U, // <5,2,1,3>: Cost 2 vzipr <4,u,5,1>, LHS
+ 3054503630U, // <5,2,1,4>: Cost 3 vtrnl <5,7,1,3>, <2,3,4,5>
+ 2601390208U, // <5,2,1,5>: Cost 3 vext1 <u,5,2,1>, <5,7,1,3>
+ 2982363484U, // <5,2,1,6>: Cost 3 vzipr <4,u,5,1>, <0,4,2,6>
+ 3189415937U, // <5,2,1,7>: Cost 3 ins <5,u,1,7>, lane 1
+ 1908621419U, // <5,2,1,u>: Cost 2 vzipr <4,u,5,1>, LHS
+ 3203366912U, // <5,2,2,0>: Cost 3 ins <u,2,2,0>, lane 0
+ 3203375104U, // <5,2,2,1>: Cost 3 ins <u,2,2,1>, lane 0
+ 2129641472U, // <5,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0
+ 2129649664U, // <5,2,2,3>: Cost 2 ins <u,2,2,3>, lane 0
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 2698036870U, // <5,2,2,5>: Cost 3 vext3 <2,2,5,5>, <2,2,5,5>
+ 3189481473U, // <5,2,2,6>: Cost 3 ins <5,u,2,6>, lane 1
+ 2846239811U, // <5,2,2,7>: Cost 3 vuzpr <4,5,6,2>, <4,2,6,7>
+ 2129641472U, // <5,2,2,u>: Cost 2 ins <u,2,2,2>, lane 0
+ 2129698816U, // <5,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 2129723392U, // <5,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2717943511U, // <5,2,3,5>: Cost 3 vext3 <5,5,5,5>, <2,3,5,5>
+ 3203489792U, // <5,2,3,6>: Cost 3 ins <u,2,3,6>, lane 0
+ 2827879424U, // <5,2,3,7>: Cost 3 vuzpr <1,5,0,2>, <1,3,5,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 3203514368U, // <5,2,4,0>: Cost 3 ins <u,2,4,0>, lane 0
+ 3189587969U, // <5,2,4,1>: Cost 3 ins <5,u,4,1>, lane 1
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 3203547136U, // <5,2,4,4>: Cost 3 ins <u,2,4,4>, lane 0
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2129821696U, // <5,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 2846239973U, // <5,2,4,7>: Cost 3 vuzpr <4,5,6,2>, <4,4,6,7>
+ 2129821696U, // <5,2,4,u>: Cost 2 ins <u,2,4,6>, lane 0
+ 3053487782U, // <5,2,5,0>: Cost 3 vtrnl <5,5,5,5>, <2,3,0,1>
+ 3203596288U, // <5,2,5,1>: Cost 3 ins <u,2,5,1>, lane 0
+ 1772498225U, // <5,2,5,2>: Cost 2 vuzpr <4,5,6,2>, <4,5,6,2>
+ 1908654182U, // <5,2,5,3>: Cost 2 vzipr <4,u,5,5>, LHS
+ 3053487822U, // <5,2,5,4>: Cost 3 vtrnl <5,5,5,5>, <2,3,4,5>
+ 2115952641U, // <5,2,5,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 2982396252U, // <5,2,5,6>: Cost 3 vzipr <4,u,5,5>, <0,4,2,6>
+ 2115969025U, // <5,2,5,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 1908654187U, // <5,2,5,u>: Cost 2 vzipr <4,u,5,5>, LHS
+ 3203661824U, // <5,2,6,0>: Cost 3 ins <u,2,6,0>, lane 0
+ 3189735425U, // <5,2,6,1>: Cost 3 ins <5,u,6,1>, lane 1
+ 2973777940U, // <5,2,6,2>: Cost 3 vzipr <3,4,5,6>, <0,0,2,2>
+ 1900036198U, // <5,2,6,3>: Cost 2 vzipr <3,4,5,6>, LHS
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 2973778186U, // <5,2,6,5>: Cost 3 vzipr <3,4,5,6>, <0,3,2,5>
+ 2973778268U, // <5,2,6,6>: Cost 3 vzipr <3,4,5,6>, <0,4,2,6>
+ 2129977344U, // <5,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0
+ 1900036203U, // <5,2,6,u>: Cost 2 vzipr <3,4,5,6>, LHS
+ 2040972182U, // <5,2,7,0>: Cost 2 vtrnr RHS, <1,2,3,0>
+ 3114713251U, // <5,2,7,1>: Cost 3 vtrnr RHS, <0,2,0,1>
+ 2040971428U, // <5,2,7,2>: Cost 2 vtrnr RHS, <0,2,0,2>
+ 1887436902U, // <5,2,7,3>: Cost 2 vzipr <1,3,5,7>, LHS
+ 2040972186U, // <5,2,7,4>: Cost 2 vtrnr RHS, <1,2,3,4>
+ 2961178728U, // <5,2,7,5>: Cost 3 vzipr <1,3,5,7>, <0,1,2,5>
+ 2040971468U, // <5,2,7,6>: Cost 2 vtrnr RHS, <0,2,4,6>
+ 2116116481U, // <5,2,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 1887436907U, // <5,2,7,u>: Cost 2 vzipr <1,3,5,7>, LHS
+ 2040980374U, // <5,2,u,0>: Cost 2 vtrnr RHS, <1,2,3,0>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2040979620U, // <5,2,u,2>: Cost 2 vtrnr RHS, <0,2,0,2>
+ 1887445094U, // <5,2,u,3>: Cost 2 vzipr <1,3,5,u>, LHS
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2115952641U, // <5,2,u,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 2040979660U, // <5,2,u,6>: Cost 2 vtrnr RHS, <0,2,4,6>
+ 2115969025U, // <5,2,u,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 1887445099U, // <5,2,u,u>: Cost 2 vzipr <1,3,5,u>, LHS
+ 3203883008U, // <5,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0
+ 2130149376U, // <5,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0
+ 2782904422U, // <5,3,0,2>: Cost 3 vuzpl <5,2,3,4>, LHS
+ 3186581506U, // <5,3,0,3>: Cost 3 ins <5,3,u,3>, lane 2
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3053750786U, // <5,3,0,5>: Cost 3 vtrnl <5,6,0,1>, <3,4,5,6>
+ 2618302971U, // <5,3,0,6>: Cost 3 vext2 <0,1,5,3>, <0,6,2,3>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2130149376U, // <5,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0
+ 2982364054U, // <5,3,1,0>: Cost 3 vzipr <4,u,5,1>, <1,2,3,0>
+ 3054504086U, // <5,3,1,1>: Cost 3 vtrnl <5,7,1,3>, <3,0,1,2>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2130239488U, // <5,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0
+ 2982364058U, // <5,3,1,4>: Cost 3 vzipr <4,u,5,1>, <1,2,3,4>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3189407745U, // <5,3,1,6>: Cost 3 ins <5,u,1,6>, lane 1
+ 2964448400U, // <5,3,1,7>: Cost 3 vzipr <1,u,5,1>, <1,5,3,7>
+ 2130239488U, // <5,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0
+ 2235845154U, // <5,3,2,0>: Cost 3 vrev <3,5,0,2>
+ 3204038656U, // <5,3,2,1>: Cost 3 ins <u,3,2,1>, lane 0
+ 3204046848U, // <5,3,2,2>: Cost 3 ins <u,3,2,2>, lane 0
+ 2130313216U, // <5,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3204079616U, // <5,3,2,6>: Cost 3 ins <u,3,2,6>, lane 0
+ 3096314880U, // <5,3,2,7>: Cost 3 vtrnr <1,5,0,2>, <1,3,5,7>
+ 2130313216U, // <5,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0
+ 3204104192U, // <5,3,3,0>: Cost 3 ins <u,3,3,0>, lane 0
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3204120576U, // <5,3,3,2>: Cost 3 ins <u,3,3,2>, lane 0
+ 2130386944U, // <5,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3189555201U, // <5,3,3,6>: Cost 3 ins <5,u,3,6>, lane 1
+ 2971763856U, // <5,3,3,7>: Cost 3 vzipr <3,1,5,3>, <1,5,3,7>
+ 2130386944U, // <5,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 2642193381U, // <5,3,4,1>: Cost 3 vext2 <4,1,5,3>, <4,1,5,3>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2130477056U, // <5,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0
+ 2846247426U, // <5,3,4,6>: Cost 3 vuzpr <4,5,6,3>, <3,4,5,6>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2130477056U, // <5,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 3053488278U, // <5,3,5,1>: Cost 3 vtrnl <5,5,5,5>, <3,0,1,2>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 1748320682U, // <5,3,5,3>: Cost 2 vuzpr <0,5,2,3>, <0,5,2,3>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2115952641U, // <5,3,5,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 3204300800U, // <5,3,5,6>: Cost 3 ins <u,3,5,6>, lane 0
+ 2130567168U, // <5,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0
+ 2130567168U, // <5,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3204333568U, // <5,3,6,1>: Cost 3 ins <u,3,6,1>, lane 0
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 2973778114U, // <5,3,6,5>: Cost 3 vzipr <3,4,5,6>, <0,2,3,5>
+ 2973779816U, // <5,3,6,6>: Cost 3 vzipr <3,4,5,6>, <2,5,3,6>
+ 2130640896U, // <5,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0
+ 2130640896U, // <5,3,6,u>: Cost 2 ins <u,3,6,7>, lane 0
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2961179382U, // <5,3,7,2>: Cost 3 vzipr <1,3,5,7>, <1,0,3,2>
+ 2040972248U, // <5,3,7,3>: Cost 2 vtrnr RHS, <1,3,1,3>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2040973006U, // <5,3,7,5>: Cost 2 vtrnr RHS, <2,3,4,5>
+ 2116108289U, // <5,3,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2040972288U, // <5,3,7,7>: Cost 2 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2961187574U, // <5,3,u,2>: Cost 3 vzipr <1,3,5,u>, <1,0,3,2>
+ 2040980440U, // <5,3,u,3>: Cost 2 vtrnr RHS, <1,3,1,3>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2040981198U, // <5,3,u,5>: Cost 2 vtrnr RHS, <2,3,4,5>
+ 2116108289U, // <5,3,u,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2040980480U, // <5,3,u,7>: Cost 2 vtrnr RHS, <1,3,5,7>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3189284865U, // <5,4,0,0>: Cost 3 ins <5,u,0,0>, lane 1
+ 2113544197U, // <5,4,0,1>: Cost 2 ins <5,4,u,u>, lane 5
+ 2781626470U, // <5,4,0,2>: Cost 3 vuzpl <5,0,4,1>, LHS
+ 2242022676U, // <5,4,0,3>: Cost 3 vrev <4,5,3,0>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2113527810U, // <5,4,0,6>: Cost 2 ins <5,4,u,6>, lane 2
+ 3114659045U, // <5,4,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,4,6,7>
+ 2113544197U, // <5,4,0,u>: Cost 2 ins <5,4,u,u>, lane 5
+ 1168067834U, // <5,4,1,0>: Cost 2 vrev <4,5,0,1>
+ 3189366785U, // <5,4,1,1>: Cost 3 ins <5,u,1,1>, lane 1
+ 3204636672U, // <5,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0
+ 2115641345U, // <5,4,1,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 2982366416U, // <5,4,1,4>: Cost 3 vzipr <4,u,5,1>, <4,4,4,4>
+ 1843006774U, // <5,4,1,5>: Cost 2 vzipl <5,1,7,3>, RHS
+ 1980763446U, // <5,4,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS
+ 3189415937U, // <5,4,1,7>: Cost 3 ins <5,u,1,7>, lane 1
+ 1843007017U, // <5,4,1,u>: Cost 2 vzipl <5,1,7,3>, RHS
+ 3204694016U, // <5,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0
+ 2241891588U, // <5,4,2,1>: Cost 3 vrev <4,5,1,2>
+ 3189448705U, // <5,4,2,2>: Cost 3 ins <5,u,2,2>, lane 1
+ 2113544197U, // <5,4,2,3>: Cost 2 ins <5,4,u,u>, lane 5
+ 3204726784U, // <5,4,2,4>: Cost 3 ins <u,4,2,4>, lane 0
+ 2973746894U, // <5,4,2,5>: Cost 3 vzipr <3,4,5,2>, <2,3,4,5>
+ 2131001344U, // <5,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 3114675429U, // <5,4,2,7>: Cost 3 vtrnr <4,5,6,2>, <4,4,6,7>
+ 2113544197U, // <5,4,2,u>: Cost 2 ins <5,4,u,u>, lane 5
+ 3204767744U, // <5,4,3,0>: Cost 3 ins <u,4,3,0>, lane 0
+ 2241899781U, // <5,4,3,1>: Cost 3 vrev <4,5,1,3>
+ 1168231694U, // <5,4,3,2>: Cost 2 vrev <4,5,2,3>
+ 3189530625U, // <5,4,3,3>: Cost 3 ins <5,u,3,3>, lane 1
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 2978399950U, // <5,4,3,5>: Cost 3 vzipr <4,2,5,3>, <2,3,4,5>
+ 2113527810U, // <5,4,3,6>: Cost 2 ins <5,4,u,6>, lane 2
+ 2840355840U, // <5,4,3,7>: Cost 3 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2113527810U, // <5,4,3,u>: Cost 2 ins <5,4,u,6>, lane 2
+ 2918763410U, // <5,4,4,0>: Cost 3 vzipl <5,4,7,6>, <4,0,5,1>
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3186991107U, // <5,4,4,2>: Cost 3 ins <5,4,4,u>, lane 3
+ 3186991107U, // <5,4,4,3>: Cost 3 ins <5,4,4,u>, lane 3
+ 2131132416U, // <5,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0
+ 1845022006U, // <5,4,4,5>: Cost 2 vzipl <5,4,7,6>, RHS
+ 2113527810U, // <5,4,4,6>: Cost 2 ins <5,4,u,6>, lane 2
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1845022249U, // <5,4,4,u>: Cost 2 vzipl <5,4,7,6>, RHS
+ 1503936614U, // <5,4,5,0>: Cost 2 vext1 <4,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3189678081U, // <5,4,5,3>: Cost 3 ins <5,u,5,3>, lane 1
+ 1168395554U, // <5,4,5,4>: Cost 2 vrev <4,5,4,5>
+ 1845529910U, // <5,4,5,5>: Cost 2 vzipl <5,5,5,5>, RHS
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2115969025U, // <5,4,5,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771800U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,5,4,6>
+ 3189743617U, // <5,4,6,2>: Cost 3 ins <5,u,6,2>, lane 1
+ 2571717194U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,5,4,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 1846299958U, // <5,4,6,5>: Cost 2 vzipl <5,6,7,0>, RHS
+ 2131296256U, // <5,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2113544197U, // <5,4,6,7>: Cost 2 ins <5,4,u,u>, lane 5
+ 1846300201U, // <5,4,6,u>: Cost 2 vzipl <5,6,7,0>, RHS
+ 2116059137U, // <5,4,7,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2113470467U, // <5,4,7,1>: Cost 2 ins <5,4,7,u>, lane 3
+ 2113470467U, // <5,4,7,2>: Cost 2 ins <5,4,7,u>, lane 3
+ 2116083713U, // <5,4,7,3>: Cost 2 ins <5,u,7,3>, lane 1
+ 2040974544U, // <5,4,7,4>: Cost 2 vtrnr RHS, <4,4,4,4>
+ 2040971602U, // <5,4,7,5>: Cost 2 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2116116481U, // <5,4,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2116059137U, // <5,4,u,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2113544197U, // <5,4,u,1>: Cost 2 ins <5,4,u,u>, lane 5
+ 2113470467U, // <5,4,u,2>: Cost 2 ins <5,4,7,u>, lane 3
+ 2115641345U, // <5,4,u,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 2040982736U, // <5,4,u,4>: Cost 2 vtrnr RHS, <4,4,4,4>
+ 2040979794U, // <5,4,u,5>: Cost 2 vtrnr RHS, <0,4,1,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2115969025U, // <5,4,u,7>: Cost 2 ins <5,u,5,7>, lane 1
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2040917295U, // <5,5,0,0>: Cost 2 vtrnr <4,5,6,0>, <4,5,6,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 1711308902U, // <5,5,0,2>: Cost 2 vuzpl <5,5,5,5>, LHS
+ 3187908610U, // <5,5,0,3>: Cost 3 ins <5,5,u,3>, lane 2
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2114183170U, // <5,5,0,5>: Cost 2 ins <5,5,u,5>, lane 2
+ 3187933186U, // <5,5,0,6>: Cost 3 ins <5,5,u,6>, lane 2
+ 2114199554U, // <5,5,0,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 1908624922U, // <5,5,1,1>: Cost 2 vzipr <4,u,5,1>, <4,u,5,1>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 1778417766U, // <5,5,1,3>: Cost 2 vuzpr <5,5,5,5>, LHS
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2114183170U, // <5,5,1,5>: Cost 2 ins <5,5,u,5>, lane 2
+ 2982365698U, // <5,5,1,6>: Cost 3 vzipr <4,u,5,1>, <3,4,5,6>
+ 2114199554U, // <5,5,1,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1778417771U, // <5,5,1,u>: Cost 2 vuzpr <5,5,5,5>, LHS
+ 2785052326U, // <5,5,2,0>: Cost 3 vuzpl <5,5,5,5>, <2,3,0,1>
+ 3205365760U, // <5,5,2,1>: Cost 3 ins <u,5,2,1>, lane 0
+ 2040933681U, // <5,5,2,2>: Cost 2 vtrnr <4,5,6,2>, <4,5,6,2>
+ 2114207749U, // <5,5,2,3>: Cost 2 ins <5,5,u,u>, lane 5
+ 2785052366U, // <5,5,2,4>: Cost 3 vuzpl <5,5,5,5>, <2,3,4,5>
+ 2114183170U, // <5,5,2,5>: Cost 2 ins <5,5,u,5>, lane 2
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 2114199554U, // <5,5,2,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 2114207749U, // <5,5,2,u>: Cost 2 ins <5,5,u,u>, lane 5
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 2785052822U, // <5,5,3,1>: Cost 3 vuzpl <5,5,5,5>, <3,0,1,2>
+ 3187900418U, // <5,5,3,2>: Cost 3 ins <5,5,u,2>, lane 2
+ 1880105089U, // <5,5,3,3>: Cost 2 vzipr <0,1,5,3>, <0,1,5,3>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2114183170U, // <5,5,3,5>: Cost 2 ins <5,5,u,5>, lane 2
+ 3205480448U, // <5,5,3,6>: Cost 3 ins <u,5,3,6>, lane 0
+ 2131746816U, // <5,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0
+ 2131746816U, // <5,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2716987279U, // <5,5,4,1>: Cost 3 vext3 <5,4,1,5>, <5,4,1,5>
+ 3187900418U, // <5,5,4,2>: Cost 3 ins <5,5,u,2>, lane 2
+ 3187908610U, // <5,5,4,3>: Cost 3 ins <5,5,u,3>, lane 2
+ 1845022662U, // <5,5,4,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 1711312182U, // <5,5,4,6>: Cost 2 vuzpl <5,5,5,5>, RHS
+ 2114199554U, // <5,5,4,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2113986563U, // <5,5,5,1>: Cost 2 ins <5,5,5,u>, lane 3
+ 2113986563U, // <5,5,5,2>: Cost 2 ins <5,5,5,u>, lane 3
+ 2113986563U, // <5,5,5,3>: Cost 2 ins <5,5,5,u>, lane 3
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2113986563U, // <5,5,5,6>: Cost 2 ins <5,5,5,u>, lane 3
+ 1778421046U, // <5,5,5,7>: Cost 2 vuzpr <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2131910656U, // <5,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <5,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <5,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <5,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <5,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <5,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 1900038658U, // <5,5,6,6>: Cost 2 vzipr <3,4,5,6>, <3,4,5,6>
+ 1058226176U, // <5,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <5,5,6,u>: Cost 1 ins RHS, lane 0
+ 2116059137U, // <5,5,7,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2114134019U, // <5,5,7,1>: Cost 2 ins <5,5,7,u>, lane 3
+ 2114134019U, // <5,5,7,2>: Cost 2 ins <5,5,7,u>, lane 3
+ 2116083713U, // <5,5,7,3>: Cost 2 ins <5,u,7,3>, lane 1
+ 2116091905U, // <5,5,7,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 2040975364U, // <5,5,7,5>: Cost 2 vtrnr RHS, <5,5,5,5>
+ 2116108289U, // <5,5,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 967232822U, // <5,5,7,7>: Cost 1 vtrnr RHS, RHS
+ 967232823U, // <5,5,7,u>: Cost 1 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 1711314734U, // <5,5,u,2>: Cost 2 vuzpl <5,5,5,5>, LHS
+ 1778418333U, // <5,5,u,3>: Cost 2 vuzpr <5,5,5,5>, LHS
+ 1845022662U, // <5,5,u,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6>
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 1711315098U, // <5,5,u,6>: Cost 2 vuzpl <5,5,5,5>, RHS
+ 967241014U, // <5,5,u,7>: Cost 1 vtrnr RHS, RHS
+ 967241015U, // <5,5,u,u>: Cost 1 vtrnr RHS, RHS
+ 2114805762U, // <5,6,0,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2132148224U, // <5,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2114838530U, // <5,6,0,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 3188588546U, // <5,6,0,5>: Cost 3 ins <5,6,u,5>, lane 2
+ 3188596738U, // <5,6,0,6>: Cost 3 ins <5,6,u,6>, lane 2
+ 2973732150U, // <5,6,0,7>: Cost 3 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2114805762U, // <5,6,1,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2115641345U, // <5,6,1,3>: Cost 2 ins <5,u,1,3>, lane 1
+ 2114838530U, // <5,6,1,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 2982366436U, // <5,6,1,6>: Cost 3 vzipr <4,u,5,1>, <4,4,6,6>
+ 1908624694U, // <5,6,1,7>: Cost 2 vzipr <4,u,5,1>, RHS
+ 1908624695U, // <5,6,1,u>: Cost 2 vzipr <4,u,5,1>, RHS
+ 2114805762U, // <5,6,2,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 3188555778U, // <5,6,2,1>: Cost 3 ins <5,6,u,1>, lane 2
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2114871301U, // <5,6,2,3>: Cost 2 ins <5,6,u,u>, lane 5
+ 2114838530U, // <5,6,2,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2964458806U, // <5,6,2,7>: Cost 3 vzipr <1,u,5,2>, RHS
+ 2114805762U, // <5,6,2,u>: Cost 2 ins <5,6,u,0>, lane 2
+ 2114805762U, // <5,6,3,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 3206103040U, // <5,6,3,1>: Cost 3 ins <u,6,3,1>, lane 0
+ 3206111232U, // <5,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2783119874U, // <5,6,3,5>: Cost 3 vuzpl <5,2,6,3>, <3,4,5,6>
+ 3206144000U, // <5,6,3,6>: Cost 3 ins <u,6,3,6>, lane 0
+ 2132410368U, // <5,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2114805762U, // <5,6,4,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 3189587969U, // <5,6,4,1>: Cost 3 ins <5,u,4,1>, lane 1
+ 2918765050U, // <5,6,4,2>: Cost 3 vzipl <5,4,7,6>, <6,2,7,3>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2114838530U, // <5,6,4,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2132475904U, // <5,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 2972437814U, // <5,6,4,7>: Cost 3 vzipr <3,2,5,4>, RHS
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2114805762U, // <5,6,5,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 2982398876U, // <5,6,5,2>: Cost 3 vzipr <4,u,5,5>, <4,0,6,2>
+ 3189678081U, // <5,6,5,3>: Cost 3 ins <5,u,5,3>, lane 1
+ 2114838530U, // <5,6,5,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 2115952641U, // <5,6,5,5>: Cost 2 ins <5,u,5,5>, lane 1
+ 1772530997U, // <5,6,5,6>: Cost 2 vuzpr <4,5,6,6>, <4,5,6,6>
+ 1908657462U, // <5,6,5,7>: Cost 2 vzipr <4,u,5,5>, RHS
+ 1908657463U, // <5,6,5,u>: Cost 2 vzipr <4,u,5,5>, RHS
+ 2114805762U, // <5,6,6,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 3189735425U, // <5,6,6,1>: Cost 3 ins <5,u,6,1>, lane 1
+ 2920043002U, // <5,6,6,2>: Cost 3 vzipl <5,6,7,0>, <6,2,7,3>
+ 2973781298U, // <5,6,6,3>: Cost 3 vzipr <3,4,5,6>, <4,5,6,3>
+ 2114838530U, // <5,6,6,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 2973781138U, // <5,6,6,5>: Cost 3 vzipr <3,4,5,6>, <4,3,6,5>
+ 2132623360U, // <5,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0
+ 1900039478U, // <5,6,6,7>: Cost 2 vzipr <3,4,5,6>, RHS
+ 1900039479U, // <5,6,6,u>: Cost 2 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1887440182U, // <5,6,7,7>: Cost 2 vzipr <1,3,5,7>, RHS
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1887448374U, // <5,6,u,7>: Cost 2 vzipr <1,3,5,u>, RHS
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1772535808U, // <5,7,0,0>: Cost 2 vuzpr RHS, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 1772535828U, // <5,7,0,2>: Cost 2 vuzpr RHS, <0,0,2,2>
+ 2115493890U, // <5,7,0,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2846279860U, // <5,7,0,5>: Cost 3 vuzpr RHS, <3,0,4,5>
+ 2846277674U, // <5,7,0,6>: Cost 3 vuzpr RHS, <0,0,4,6>
+ 2115526658U, // <5,7,0,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2115018755U, // <5,7,1,0>: Cost 2 ins <5,7,1,u>, lane 3
+ 1772536628U, // <5,7,1,1>: Cost 2 vuzpr RHS, <1,1,1,1>
+ 2115018755U, // <5,7,1,2>: Cost 2 ins <5,7,1,u>, lane 3
+ 698794086U, // <5,7,1,3>: Cost 1 vuzpr RHS, LHS
+ 2115018755U, // <5,7,1,4>: Cost 2 ins <5,7,1,u>, lane 3
+ 2115018755U, // <5,7,1,5>: Cost 2 ins <5,7,1,u>, lane 3
+ 2115018755U, // <5,7,1,6>: Cost 2 ins <5,7,1,u>, lane 3
+ 2115526658U, // <5,7,1,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 698794091U, // <5,7,1,u>: Cost 1 vuzpr RHS, LHS
+ 1772536726U, // <5,7,2,0>: Cost 2 vuzpr RHS, <1,2,3,0>
+ 2846277795U, // <5,7,2,1>: Cost 3 vuzpr RHS, <0,2,0,1>
+ 1772535972U, // <5,7,2,2>: Cost 2 vuzpr RHS, <0,2,0,2>
+ 1772537458U, // <5,7,2,3>: Cost 2 vuzpr RHS, <2,2,3,3>
+ 1772536730U, // <5,7,2,4>: Cost 2 vuzpr RHS, <1,2,3,4>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 1772536012U, // <5,7,2,6>: Cost 2 vuzpr RHS, <0,2,4,6>
+ 2115526658U, // <5,7,2,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 1772535978U, // <5,7,2,u>: Cost 2 vuzpr RHS, <0,2,0,u>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 1772537510U, // <5,7,3,1>: Cost 2 vuzpr RHS, <2,3,0,1>
+ 2846278606U, // <5,7,3,2>: Cost 3 vuzpr RHS, <1,3,0,2>
+ 1772536792U, // <5,7,3,3>: Cost 2 vuzpr RHS, <1,3,1,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 1772537550U, // <5,7,3,5>: Cost 2 vuzpr RHS, <2,3,4,5>
+ 2846278628U, // <5,7,3,6>: Cost 3 vuzpr RHS, <1,3,2,6>
+ 1772536832U, // <5,7,3,7>: Cost 2 vuzpr RHS, <1,3,5,7>
+ 1772536797U, // <5,7,3,u>: Cost 2 vuzpr RHS, <1,3,1,u>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 2846277958U, // <5,7,4,2>: Cost 3 vuzpr RHS, <0,4,0,2>
+ 2115493890U, // <5,7,4,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1772539088U, // <5,7,4,4>: Cost 2 vuzpr RHS, <4,4,4,4>
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 1772536156U, // <5,7,4,6>: Cost 2 vuzpr RHS, <0,4,2,6>
+ 2115526658U, // <5,7,4,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2115313667U, // <5,7,5,0>: Cost 2 ins <5,7,5,u>, lane 3
+ 2115313667U, // <5,7,5,1>: Cost 2 ins <5,7,5,u>, lane 3
+ 2115313667U, // <5,7,5,2>: Cost 2 ins <5,7,5,u>, lane 3
+ 2115493890U, // <5,7,5,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 2115313667U, // <5,7,5,4>: Cost 2 ins <5,7,5,u>, lane 3
+ 1772539908U, // <5,7,5,5>: Cost 2 vuzpr RHS, <5,5,5,5>
+ 2115313667U, // <5,7,5,6>: Cost 2 ins <5,7,5,u>, lane 3
+ 698797366U, // <5,7,5,7>: Cost 1 vuzpr RHS, RHS
+ 698797367U, // <5,7,5,u>: Cost 1 vuzpr RHS, RHS
+ 1772540002U, // <5,7,6,0>: Cost 2 vuzpr RHS, <5,6,7,0>
+ 2846279577U, // <5,7,6,1>: Cost 3 vuzpr RHS, <2,6,0,1>
+ 1772539212U, // <5,7,6,2>: Cost 2 vuzpr RHS, <4,6,0,2>
+ 2115493890U, // <5,7,6,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1772540006U, // <5,7,6,4>: Cost 2 vuzpr RHS, <5,6,7,4>
+ 2846279617U, // <5,7,6,5>: Cost 3 vuzpr RHS, <2,6,4,5>
+ 1772539252U, // <5,7,6,6>: Cost 2 vuzpr RHS, <4,6,4,6>
+ 1772537786U, // <5,7,6,7>: Cost 2 vuzpr RHS, <2,6,3,7>
+ 1772537787U, // <5,7,6,u>: Cost 2 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1772540750U, // <5,7,7,1>: Cost 2 vuzpr RHS, <6,7,0,1>
+ 2846281846U, // <5,7,7,2>: Cost 3 vuzpr RHS, <5,7,0,2>
+ 1772540032U, // <5,7,7,3>: Cost 2 vuzpr RHS, <5,7,1,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1772540790U, // <5,7,7,5>: Cost 2 vuzpr RHS, <6,7,4,5>
+ 2116108289U, // <5,7,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 1772540072U, // <5,7,7,7>: Cost 2 vuzpr RHS, <5,7,5,7>
+ 1772540037U, // <5,7,7,u>: Cost 2 vuzpr RHS, <5,7,1,u>
+ 1772537212U, // <5,7,u,0>: Cost 2 vuzpr RHS, <1,u,3,0>
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 1772536458U, // <5,7,u,2>: Cost 2 vuzpr RHS, <0,u,0,2>
+ 698794653U, // <5,7,u,3>: Cost 1 vuzpr RHS, LHS
+ 1772537216U, // <5,7,u,4>: Cost 2 vuzpr RHS, <1,u,3,4>
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 1772536480U, // <5,7,u,6>: Cost 2 vuzpr RHS, <0,u,2,6>
+ 698797609U, // <5,7,u,7>: Cost 1 vuzpr RHS, RHS
+ 698794658U, // <5,7,u,u>: Cost 1 vuzpr RHS, LHS
+ 1772544000U, // <5,u,0,0>: Cost 2 vuzpr RHS, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1772544020U, // <5,u,0,2>: Cost 2 vuzpr RHS, <0,0,2,2>
+ 2111512578U, // <5,u,0,3>: Cost 2 ins <5,1,u,3>, lane 2
+ 2114838530U, // <5,u,0,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 2114183170U, // <5,u,0,5>: Cost 2 ins <5,5,u,5>, lane 2
+ 2113527810U, // <5,u,0,6>: Cost 2 ins <5,4,u,6>, lane 2
+ 2114199554U, // <5,u,0,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2114805762U, // <5,u,1,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 1772544820U, // <5,u,1,1>: Cost 2 vuzpr RHS, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 698802278U, // <5,u,1,3>: Cost 1 vuzpr RHS, LHS
+ 2114838530U, // <5,u,1,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 1843009690U, // <5,u,1,5>: Cost 2 vzipl <5,1,7,3>, RHS
+ 1980766362U, // <5,u,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS
+ 1908624712U, // <5,u,1,7>: Cost 2 vzipr <4,u,5,1>, RHS
+ 698802283U, // <5,u,1,u>: Cost 1 vuzpr RHS, LHS
+ 1772544918U, // <5,u,2,0>: Cost 2 vuzpr RHS, <1,2,3,0>
+ 2128969728U, // <5,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 1772544164U, // <5,u,2,2>: Cost 2 vuzpr RHS, <0,2,0,2>
+ 1055244288U, // <5,u,2,3>: Cost 1 ins LHS, lane 0
+ 1772544922U, // <5,u,2,4>: Cost 2 vuzpr RHS, <1,2,3,4>
+ 2129002496U, // <5,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 1772544204U, // <5,u,2,6>: Cost 2 vuzpr RHS, <0,2,4,6>
+ 2114199554U, // <5,u,2,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1055244288U, // <5,u,2,u>: Cost 1 ins LHS, lane 0
+ 2129698816U, // <5,u,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 1772545702U, // <5,u,3,1>: Cost 2 vuzpr RHS, <2,3,0,1>
+ 2128388096U, // <5,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 1772544984U, // <5,u,3,3>: Cost 2 vuzpr RHS, <1,3,1,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 1772545742U, // <5,u,3,5>: Cost 2 vuzpr RHS, <2,3,4,5>
+ 2113527810U, // <5,u,3,6>: Cost 2 ins <5,4,u,6>, lane 2
+ 1772545024U, // <5,u,3,7>: Cost 2 vuzpr RHS, <1,3,5,7>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 2114805762U, // <5,u,4,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 1845024558U, // <5,u,4,1>: Cost 2 vzipl <5,4,7,6>, LHS
+ 2642897979U, // <5,u,4,2>: Cost 3 vext2 <4,2,5,u>, <4,2,5,u>
+ 2111512578U, // <5,u,4,3>: Cost 2 ins <5,1,u,3>, lane 2
+ 1772547280U, // <5,u,4,4>: Cost 2 vuzpr RHS, <4,4,4,4>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1772544348U, // <5,u,4,6>: Cost 2 vuzpr RHS, <0,4,2,6>
+ 2114199554U, // <5,u,4,7>: Cost 2 ins <5,5,u,7>, lane 2
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1845532462U, // <5,u,5,1>: Cost 2 vzipl <5,5,5,5>, LHS
+ 1979750190U, // <5,u,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS
+ 1908654236U, // <5,u,5,3>: Cost 2 vzipr <4,u,5,5>, LHS
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 698805558U, // <5,u,5,7>: Cost 1 vuzpr RHS, RHS
+ 698805559U, // <5,u,5,u>: Cost 1 vuzpr RHS, RHS
+ 1772548194U, // <5,u,6,0>: Cost 2 vuzpr RHS, <5,6,7,0>
+ 1846302510U, // <5,u,6,1>: Cost 2 vzipl <5,6,7,0>, LHS
+ 1772547404U, // <5,u,6,2>: Cost 2 vuzpr RHS, <4,6,0,2>
+ 1900036252U, // <5,u,6,3>: Cost 2 vzipr <3,4,5,6>, LHS
+ 1772548198U, // <5,u,6,4>: Cost 2 vuzpr RHS, <5,6,7,4>
+ 1846302874U, // <5,u,6,5>: Cost 2 vzipl <5,6,7,0>, RHS
+ 1772547444U, // <5,u,6,6>: Cost 2 vuzpr RHS, <4,6,4,6>
+ 1058226176U, // <5,u,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <5,u,6,u>: Cost 1 ins RHS, lane 0
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 2040971914U, // <5,u,7,2>: Cost 2 vtrnr RHS, <0,u,0,2>
+ 967230109U, // <5,u,7,3>: Cost 1 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 2040971926U, // <5,u,7,5>: Cost 2 vtrnr RHS, <0,u,1,5>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 967233065U, // <5,u,7,7>: Cost 1 vtrnr RHS, RHS
+ 967230114U, // <5,u,7,u>: Cost 1 vtrnr RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 698802845U, // <5,u,u,3>: Cost 1 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 698805801U, // <5,u,u,7>: Cost 1 vuzpr RHS, RHS
+ 698802850U, // <5,u,u,u>: Cost 1 vuzpr RHS, LHS
+ 2128150528U, // <6,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0
+ 2121523201U, // <6,0,0,1>: Cost 2 ins <6,u,0,1>, lane 1
+ 1718206566U, // <6,0,0,2>: Cost 2 vuzpl <6,7,0,1>, LHS
+ 2852933922U, // <6,0,0,3>: Cost 3 vuzpr <5,6,7,0>, <6,0,1,3>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 2852934680U, // <6,0,0,5>: Cost 3 vuzpr <5,6,7,0>, <7,0,4,5>
+ 2852934690U, // <6,0,0,6>: Cost 3 vuzpr <5,6,7,0>, <7,0,5,6>
+ 2852933962U, // <6,0,0,7>: Cost 3 vuzpr <5,6,7,0>, <6,0,5,7>
+ 1718206620U, // <6,0,0,u>: Cost 2 vuzpl <6,7,0,1>, LHS
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 2128232448U, // <6,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1779187814U, // <6,0,1,3>: Cost 2 vuzpr <5,6,7,0>, LHS
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2791949566U, // <6,0,1,7>: Cost 3 vuzpl <6,7,0,1>, <1,6,7,0>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504280678U, // <6,0,2,0>: Cost 2 vext1 <4,6,0,2>, LHS
+ 1849639014U, // <6,0,2,1>: Cost 2 vzipl <6,2,7,3>, LHS
+ 2128314368U, // <6,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2128322560U, // <6,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 2578026192U, // <6,0,2,5>: Cost 3 vext1 <4,6,0,2>, <5,1,7,3>
+ 2578026792U, // <6,0,2,6>: Cost 3 vext1 <4,6,0,2>, <6,0,2,0>
+ 2578027514U, // <6,0,2,7>: Cost 3 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3202113536U, // <6,0,3,0>: Cost 3 ins <u,0,3,0>, lane 0
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2128388096U, // <6,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 2852930520U, // <6,0,3,3>: Cost 3 vuzpr <5,6,7,0>, <1,3,1,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 2852931278U, // <6,0,3,5>: Cost 3 vuzpr <5,6,7,0>, <2,3,4,5>
+ 3190587394U, // <6,0,3,6>: Cost 3 ins <6,0,u,6>, lane 2
+ 2852930560U, // <6,0,3,7>: Cost 3 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2128388096U, // <6,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3195576321U, // <6,0,4,3>: Cost 3 ins <6,u,4,3>, lane 1
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2121850881U, // <6,0,4,5>: Cost 2 ins <6,u,4,5>, lane 1
+ 1718209846U, // <6,0,4,6>: Cost 2 vuzpl <6,7,0,1>, RHS
+ 3195609089U, // <6,0,4,7>: Cost 3 ins <6,u,4,7>, lane 1
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3202260992U, // <6,0,5,0>: Cost 3 ins <u,0,5,0>, lane 0
+ 2128527360U, // <6,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 3056156774U, // <6,0,5,2>: Cost 3 vtrnl <6,0,5,7>, LHS
+ 3190562818U, // <6,0,5,3>: Cost 3 ins <6,0,u,3>, lane 2
+ 3058802892U, // <6,0,5,4>: Cost 3 vtrnl <6,4,5,6>, <0,2,4,6>
+ 2852933636U, // <6,0,5,5>: Cost 3 vuzpr <5,6,7,0>, <5,5,5,5>
+ 2852932908U, // <6,0,5,6>: Cost 3 vuzpr <5,6,7,0>, <4,5,5,6>
+ 1779191094U, // <6,0,5,7>: Cost 2 vuzpr <5,6,7,0>, RHS
+ 1779191095U, // <6,0,5,u>: Cost 2 vuzpr <5,6,7,0>, RHS
+ 1779191906U, // <6,0,6,0>: Cost 2 vuzpr <5,6,7,0>, <5,6,7,0>
+ 1852244070U, // <6,0,6,1>: Cost 2 vzipl <6,6,6,6>, LHS
+ 1986461798U, // <6,0,6,2>: Cost 2 vtrnl <6,6,6,6>, LHS
+ 3195723777U, // <6,0,6,3>: Cost 3 ins <6,u,6,3>, lane 1
+ 2852933734U, // <6,0,6,4>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,4>
+ 3195740161U, // <6,0,6,5>: Cost 3 ins <6,u,6,5>, lane 1
+ 2122006529U, // <6,0,6,6>: Cost 2 ins <6,u,6,6>, lane 1
+ 2128650240U, // <6,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0
+ 1852244637U, // <6,0,6,u>: Cost 2 vzipl <6,6,6,6>, LHS
+ 1906753536U, // <6,0,7,0>: Cost 2 vzipr RHS, <0,0,0,0>
+ 1906755238U, // <6,0,7,1>: Cost 2 vzipr RHS, <2,3,0,1>
+ 1906753700U, // <6,0,7,2>: Cost 2 vzipr RHS, <0,2,0,2>
+ 2122055681U, // <6,0,7,3>: Cost 2 ins <6,u,7,3>, lane 1
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 2980496418U, // <6,0,7,5>: Cost 3 vzipr RHS, <1,4,0,5>
+ 2980495690U, // <6,0,7,6>: Cost 3 vzipr RHS, <0,4,0,6>
+ 2122088449U, // <6,0,7,7>: Cost 2 ins <6,u,7,7>, lane 1
+ 1906753706U, // <6,0,7,u>: Cost 2 vzipr RHS, <0,2,0,u>
+ 1906761728U, // <6,0,u,0>: Cost 2 vzipr RHS, <0,0,0,0>
+ 1906763430U, // <6,0,u,1>: Cost 2 vzipr RHS, <2,3,0,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1779188381U, // <6,0,u,3>: Cost 2 vuzpr <5,6,7,0>, LHS
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2121850881U, // <6,0,u,5>: Cost 2 ins <6,u,4,5>, lane 1
+ 1718212762U, // <6,0,u,6>: Cost 2 vuzpl <6,7,0,1>, RHS
+ 1779191337U, // <6,0,u,7>: Cost 2 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2121523201U, // <6,1,0,1>: Cost 2 ins <6,u,0,1>, lane 1
+ 2846673046U, // <6,1,0,2>: Cost 3 vuzpr <4,6,3,1>, <3,0,1,2>
+ 2047623270U, // <6,1,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS
+ 2787385548U, // <6,1,0,4>: Cost 3 vuzpl <6,0,1,2>, <0,2,4,6>
+ 3060384768U, // <6,1,0,5>: Cost 3 vtrnl <6,7,0,1>, <1,3,5,7>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 3060385022U, // <6,1,0,7>: Cost 3 vtrnl <6,7,0,1>, <1,6,7,0>
+ 2047623275U, // <6,1,0,u>: Cost 2 vtrnr <5,6,7,0>, LHS
+ 2578088038U, // <6,1,1,0>: Cost 3 vext1 <4,6,1,1>, LHS
+ 2128896000U, // <6,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0
+ 2981778426U, // <6,1,1,2>: Cost 3 vzipr <4,7,6,1>, <7,0,1,2>
+ 2128912384U, // <6,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3202670592U, // <6,1,1,5>: Cost 3 ins <u,1,1,5>, lane 0
+ 2691482470U, // <6,1,1,6>: Cost 3 vext3 <1,1,6,6>, <1,1,6,6>
+ 2980449545U, // <6,1,1,7>: Cost 3 vzipr <4,5,6,1>, <4,5,1,7>
+ 2128896000U, // <6,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0
+ 2128961536U, // <6,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <6,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <6,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <6,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <6,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <6,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <6,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <6,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <6,1,2,u>: Cost 1 ins LHS, lane 0
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 2129059840U, // <6,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 2953923849U, // <6,1,3,7>: Cost 3 vzipr <0,1,6,3>, <4,5,1,7>
+ 2129059840U, // <6,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0
+ 2788724044U, // <6,1,4,0>: Cost 3 vuzpl <6,2,1,3>, <4,6,0,2>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3195568129U, // <6,1,4,2>: Cost 3 ins <6,u,4,2>, lane 1
+ 2047656038U, // <6,1,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS
+ 2791378292U, // <6,1,4,4>: Cost 3 vuzpl <6,6,1,3>, <4,6,4,6>
+ 2121850881U, // <6,1,4,5>: Cost 2 ins <6,u,4,5>, lane 1
+ 2834506076U, // <6,1,4,6>: Cost 3 vuzpr <2,6,0,1>, <0,4,2,6>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2047656043U, // <6,1,4,u>: Cost 2 vtrnr <5,6,7,4>, LHS
+ 2578120806U, // <6,1,5,0>: Cost 3 vext1 <4,6,1,5>, LHS
+ 2578121728U, // <6,1,5,1>: Cost 3 vext1 <4,6,1,5>, <1,3,5,7>
+ 3202940928U, // <6,1,5,2>: Cost 3 ins <u,1,5,2>, lane 0
+ 2129207296U, // <6,1,5,3>: Cost 2 ins <u,1,5,3>, lane 0
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3202965504U, // <6,1,5,5>: Cost 3 ins <u,1,5,5>, lane 0
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 2834509110U, // <6,1,5,7>: Cost 3 vuzpr <2,6,0,1>, RHS
+ 2129207296U, // <6,1,5,u>: Cost 2 ins <u,1,5,3>, lane 0
+ 2925986550U, // <6,1,6,0>: Cost 3 vzipl <6,6,6,6>, <1,0,3,2>
+ 2834507673U, // <6,1,6,1>: Cost 3 vuzpr <2,6,0,1>, <2,6,0,1>
+ 2982480022U, // <6,1,6,2>: Cost 3 vzipr <4,u,6,6>, <3,0,1,2>
+ 2041479270U, // <6,1,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS
+ 2602020150U, // <6,1,6,4>: Cost 3 vext1 <u,6,1,6>, RHS
+ 2982478162U, // <6,1,6,5>: Cost 3 vzipr <4,u,6,6>, <0,4,1,5>
+ 2122006529U, // <6,1,6,6>: Cost 2 ins <6,u,6,6>, lane 1
+ 2129313792U, // <6,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0
+ 2041479275U, // <6,1,6,u>: Cost 2 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 1906753546U, // <6,1,7,1>: Cost 2 vzipr RHS, <0,0,1,1>
+ 1906755734U, // <6,1,7,2>: Cost 2 vzipr RHS, <3,0,1,2>
+ 2029469798U, // <6,1,7,3>: Cost 2 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 1906753874U, // <6,1,7,5>: Cost 2 vzipr RHS, <0,4,1,5>
+ 2980495537U, // <6,1,7,6>: Cost 3 vzipr RHS, <0,2,1,6>
+ 2122088449U, // <6,1,7,7>: Cost 2 ins <6,u,7,7>, lane 1
+ 2029469803U, // <6,1,7,u>: Cost 2 vtrnr <2,6,3,7>, LHS
+ 2128961536U, // <6,1,u,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 1906761738U, // <6,1,u,1>: Cost 2 vzipr RHS, <0,0,1,1>
+ 1906763926U, // <6,1,u,2>: Cost 2 vzipr RHS, <3,0,1,2>
+ 1055244288U, // <6,1,u,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <6,1,u,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 1906762066U, // <6,1,u,5>: Cost 2 vzipr RHS, <0,4,1,5>
+ 2129010688U, // <6,1,u,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2122088449U, // <6,1,u,7>: Cost 2 ins <6,u,7,7>, lane 1
+ 1055244288U, // <6,1,u,u>: Cost 1 ins LHS, lane 0
+ 2846457856U, // <6,2,0,0>: Cost 3 vuzpr <4,6,0,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2129494016U, // <6,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 2118148098U, // <6,2,0,3>: Cost 2 ins <6,2,u,3>, lane 2
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3195297793U, // <6,2,0,5>: Cost 3 ins <6,u,0,5>, lane 1
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3195314177U, // <6,2,0,7>: Cost 3 ins <6,u,0,7>, lane 1
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2846458676U, // <6,2,1,1>: Cost 3 vuzpr <4,6,0,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 1772716134U, // <6,2,1,3>: Cost 2 vuzpr <4,6,0,2>, LHS
+ 3191414787U, // <6,2,1,4>: Cost 3 ins <6,2,1,u>, lane 3
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 3114885324U, // <6,2,1,6>: Cost 3 vtrnr <4,6,0,1>, <0,2,4,6>
+ 3191922690U, // <6,2,1,7>: Cost 3 ins <6,2,u,7>, lane 2
+ 1772716139U, // <6,2,1,u>: Cost 2 vuzpr <4,6,0,2>, LHS
+ 2846458774U, // <6,2,2,0>: Cost 3 vuzpr <4,6,0,2>, <1,2,3,0>
+ 3195412481U, // <6,2,2,1>: Cost 3 ins <6,u,2,1>, lane 1
+ 2129641472U, // <6,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0
+ 1908703334U, // <6,2,2,3>: Cost 2 vzipr <4,u,6,2>, LHS
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3195445249U, // <6,2,2,5>: Cost 3 ins <6,u,2,5>, lane 1
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 2846462444U, // <6,2,2,7>: Cost 3 vuzpr <4,6,0,2>, <6,2,5,7>
+ 1908703339U, // <6,2,2,u>: Cost 2 vzipr <4,u,6,2>, LHS
+ 2129698816U, // <6,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 2230618020U, // <6,2,3,1>: Cost 3 vrev <2,6,1,3>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2129723392U, // <6,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0
+ 2129731584U, // <6,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 2846459598U, // <6,2,3,5>: Cost 3 vuzpr <4,6,0,2>, <2,3,4,5>
+ 2966528348U, // <6,2,3,6>: Cost 3 vzipr <2,2,6,3>, <0,4,2,6>
+ 2846458880U, // <6,2,3,7>: Cost 3 vuzpr <4,6,0,2>, <1,3,5,7>
+ 2129698816U, // <6,2,3,u>: Cost 2 ins <u,2,3,0>, lane 0
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3191873538U, // <6,2,4,1>: Cost 3 ins <6,2,u,1>, lane 2
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2118148098U, // <6,2,4,3>: Cost 2 ins <6,2,u,3>, lane 2
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2129821696U, // <6,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 3195609089U, // <6,2,4,7>: Cost 3 ins <6,u,4,7>, lane 1
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3191709699U, // <6,2,5,0>: Cost 3 ins <6,2,5,u>, lane 3
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3203604480U, // <6,2,5,2>: Cost 3 ins <u,2,5,2>, lane 0
+ 2118148098U, // <6,2,5,3>: Cost 2 ins <6,2,u,3>, lane 2
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2846461956U, // <6,2,5,5>: Cost 3 vuzpr <4,6,0,2>, <5,5,5,5>
+ 3115213004U, // <6,2,5,6>: Cost 3 vtrnr <4,6,4,5>, <0,2,4,6>
+ 1772719414U, // <6,2,5,7>: Cost 2 vuzpr <4,6,0,2>, RHS
+ 1772719415U, // <6,2,5,u>: Cost 2 vuzpr <4,6,0,2>, RHS
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 3195707393U, // <6,2,6,1>: Cost 3 ins <6,u,6,1>, lane 1
+ 1772719436U, // <6,2,6,2>: Cost 2 vuzpr <4,6,0,2>, <4,6,0,2>
+ 1908736102U, // <6,2,6,3>: Cost 2 vzipr <4,u,6,6>, LHS
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 3195740161U, // <6,2,6,5>: Cost 3 ins <6,u,6,5>, lane 1
+ 2122006529U, // <6,2,6,6>: Cost 2 ins <6,u,6,6>, lane 1
+ 2118189061U, // <6,2,6,7>: Cost 2 ins <6,2,u,u>, lane 5
+ 1908736107U, // <6,2,6,u>: Cost 2 vzipr <4,u,6,6>, LHS
+ 2118115331U, // <6,2,7,0>: Cost 2 ins <6,2,7,u>, lane 3
+ 2118115331U, // <6,2,7,1>: Cost 2 ins <6,2,7,u>, lane 3
+ 1906753556U, // <6,2,7,2>: Cost 2 vzipr RHS, <0,0,2,2>
+ 833011814U, // <6,2,7,3>: Cost 1 vzipr RHS, LHS
+ 2118115331U, // <6,2,7,4>: Cost 2 ins <6,2,7,u>, lane 3
+ 2118115331U, // <6,2,7,5>: Cost 2 ins <6,2,7,u>, lane 3
+ 1906753884U, // <6,2,7,6>: Cost 2 vzipr RHS, <0,4,2,6>
+ 2122088449U, // <6,2,7,7>: Cost 2 ins <6,u,7,7>, lane 1
+ 833011819U, // <6,2,7,u>: Cost 1 vzipr RHS, LHS
+ 2129698816U, // <6,2,u,0>: Cost 2 ins <u,2,3,0>, lane 0
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 1906761748U, // <6,2,u,2>: Cost 2 vzipr RHS, <0,0,2,2>
+ 833020006U, // <6,2,u,3>: Cost 1 vzipr RHS, LHS
+ 2129731584U, // <6,2,u,4>: Cost 2 ins <u,2,3,4>, lane 0
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 1906762076U, // <6,2,u,6>: Cost 2 vzipr RHS, <0,4,2,6>
+ 1772719657U, // <6,2,u,7>: Cost 2 vuzpr <4,6,0,2>, RHS
+ 833020011U, // <6,2,u,u>: Cost 1 vzipr RHS, LHS
+ 3203883008U, // <6,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0
+ 2130149376U, // <6,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3121365976U, // <6,3,0,3>: Cost 3 vtrnr <5,6,7,0>, <1,3,1,3>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 3121366734U, // <6,3,0,5>: Cost 3 vtrnr <5,6,7,0>, <2,3,4,5>
+ 3195305985U, // <6,3,0,6>: Cost 3 ins <6,u,0,6>, lane 1
+ 3121366016U, // <6,3,0,7>: Cost 3 vtrnr <5,6,7,0>, <1,3,5,7>
+ 2130149376U, // <6,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0
+ 2578235494U, // <6,3,1,0>: Cost 3 vext1 <4,6,3,1>, LHS
+ 3203964928U, // <6,3,1,1>: Cost 3 ins <u,3,1,1>, lane 0
+ 3203973120U, // <6,3,1,2>: Cost 3 ins <u,3,1,2>, lane 0
+ 2130239488U, // <6,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3203997696U, // <6,3,1,5>: Cost 3 ins <u,3,1,5>, lane 0
+ 2822725737U, // <6,3,1,6>: Cost 3 vuzpr <0,6,2,3>, <0,1,2,6>
+ 2970494906U, // <6,3,1,7>: Cost 3 vzipr <2,u,6,1>, <2,6,3,7>
+ 2130239488U, // <6,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0
+ 2982445974U, // <6,3,2,0>: Cost 3 vzipr <4,u,6,2>, <1,2,3,0>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 2630985357U, // <6,3,2,2>: Cost 3 vext2 <2,2,6,3>, <2,2,6,3>
+ 2130313216U, // <6,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0
+ 2982445978U, // <6,3,2,4>: Cost 3 vzipr <4,u,6,2>, <1,2,3,4>
+ 3114895054U, // <6,3,2,5>: Cost 3 vtrnr <4,6,0,2>, <2,3,4,5>
+ 2834596044U, // <6,3,2,6>: Cost 3 vuzpr <2,6,1,3>, <0,2,4,6>
+ 3114894336U, // <6,3,2,7>: Cost 3 vtrnr <4,6,0,2>, <1,3,5,7>
+ 2130313216U, // <6,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0
+ 2578251878U, // <6,3,3,0>: Cost 3 vext1 <4,6,3,3>, LHS
+ 2792163478U, // <6,3,3,1>: Cost 3 vuzpl <6,7,3,0>, <3,0,1,2>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2130386944U, // <6,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 2792196610U, // <6,3,3,5>: Cost 3 vuzpl <6,7,3,4>, <3,4,5,6>
+ 2590200602U, // <6,3,3,6>: Cost 3 vext1 <6,6,3,3>, <6,6,3,3>
+ 2972501946U, // <6,3,3,7>: Cost 3 vzipr <3,2,6,3>, <2,6,3,7>
+ 2130386944U, // <6,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2705050078U, // <6,3,4,1>: Cost 3 vext3 <3,4,1,6>, <3,4,1,6>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2846540124U, // <6,3,4,6>: Cost 3 vuzpr <4,6,1,3>, <0,4,2,6>
+ 3121398784U, // <6,3,4,7>: Cost 3 vtrnr <5,6,7,4>, <1,3,5,7>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 2578268262U, // <6,3,5,0>: Cost 3 vext1 <4,6,3,5>, LHS
+ 3204259840U, // <6,3,5,1>: Cost 3 ins <u,3,5,1>, lane 0
+ 2648903448U, // <6,3,5,2>: Cost 3 vext2 <5,2,6,3>, <5,2,6,3>
+ 2578270722U, // <6,3,5,3>: Cost 3 vext1 <4,6,3,5>, <3,4,5,6>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3204292608U, // <6,3,5,5>: Cost 3 ins <u,3,5,5>, lane 0
+ 3204300800U, // <6,3,5,6>: Cost 3 ins <u,3,5,6>, lane 0
+ 2130567168U, // <6,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0
+ 2130567168U, // <6,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0
+ 2982478742U, // <6,3,6,0>: Cost 3 vzipr <4,u,6,6>, <1,2,3,0>
+ 3115222694U, // <6,3,6,1>: Cost 3 vtrnr <4,6,4,6>, <2,3,0,1>
+ 2982478582U, // <6,3,6,2>: Cost 3 vzipr <4,u,6,6>, <1,0,3,2>
+ 1748984315U, // <6,3,6,3>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3>
+ 2982478746U, // <6,3,6,4>: Cost 3 vzipr <4,u,6,6>, <1,2,3,4>
+ 3115222734U, // <6,3,6,5>: Cost 3 vtrnr <4,6,4,6>, <2,3,4,5>
+ 2122006529U, // <6,3,6,6>: Cost 2 ins <6,u,6,6>, lane 1
+ 2130640896U, // <6,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0
+ 1748984315U, // <6,3,6,u>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3>
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 1906754376U, // <6,3,7,3>: Cost 2 vzipr RHS, <1,1,3,3>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 3103213262U, // <6,3,7,5>: Cost 3 vtrnr <2,6,3,7>, <2,3,4,5>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 1906754704U, // <6,3,7,7>: Cost 2 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2130149376U, // <6,3,u,1>: Cost 2 ins <u,3,0,1>, lane 0
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 1906762568U, // <6,3,u,3>: Cost 2 vzipr RHS, <1,1,3,3>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2122006529U, // <6,3,u,6>: Cost 2 ins <6,u,6,6>, lane 1
+ 1906762896U, // <6,3,u,7>: Cost 2 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2242465098U, // <6,4,0,0>: Cost 3 vrev <4,6,0,0>
+ 2121523201U, // <6,4,0,1>: Cost 2 ins <6,u,0,1>, lane 1
+ 1718534246U, // <6,4,0,2>: Cost 2 vuzpl <6,7,4,5>, LHS
+ 3195281409U, // <6,4,0,3>: Cost 3 ins <6,u,0,3>, lane 1
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 1986645302U, // <6,4,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS
+ 3195314177U, // <6,4,0,7>: Cost 3 ins <6,u,0,7>, lane 1
+ 1986645320U, // <6,4,0,u>: Cost 2 vtrnl <6,7,0,1>, RHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 2242547028U, // <6,4,1,1>: Cost 3 vrev <4,6,1,1>
+ 3204636672U, // <6,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0
+ 1779220582U, // <6,4,1,3>: Cost 2 vuzpr <5,6,7,4>, LHS
+ 3059813748U, // <6,4,1,4>: Cost 3 vtrnl <6,6,1,3>, <4,6,4,6>
+ 2130919424U, // <6,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0
+ 3102941532U, // <6,4,1,6>: Cost 3 vtrnr <2,6,0,1>, <0,4,2,6>
+ 2242989450U, // <6,4,1,7>: Cost 3 vrev <4,6,7,1>
+ 1779220587U, // <6,4,1,u>: Cost 2 vuzpr <5,6,7,4>, LHS
+ 1168739660U, // <6,4,2,0>: Cost 2 vrev <4,6,0,2>
+ 3195412481U, // <6,4,2,1>: Cost 3 ins <6,u,2,1>, lane 1
+ 2242628958U, // <6,4,2,2>: Cost 3 vrev <4,6,2,2>
+ 2130976768U, // <6,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 1849642294U, // <6,4,2,5>: Cost 2 vzipl <6,2,7,3>, RHS
+ 2131001344U, // <6,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 3195461633U, // <6,4,2,7>: Cost 3 ins <6,u,2,7>, lane 1
+ 1169329556U, // <6,4,2,u>: Cost 2 vrev <4,6,u,2>
+ 3195478017U, // <6,4,3,0>: Cost 3 ins <6,u,3,0>, lane 1
+ 2242563414U, // <6,4,3,1>: Cost 3 vrev <4,6,1,3>
+ 2242637151U, // <6,4,3,2>: Cost 3 vrev <4,6,2,3>
+ 2242710888U, // <6,4,3,3>: Cost 3 vrev <4,6,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 2846623438U, // <6,4,3,5>: Cost 3 vuzpr <4,6,2,4>, <2,3,4,5>
+ 2965864652U, // <6,4,3,6>: Cost 3 vzipr <2,1,6,3>, <0,2,4,6>
+ 2852963328U, // <6,4,3,7>: Cost 3 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2243079573U, // <6,4,3,u>: Cost 3 vrev <4,6,u,3>
+ 2242497870U, // <6,4,4,0>: Cost 3 vrev <4,6,0,4>
+ 2852967732U, // <6,4,4,1>: Cost 3 vuzpr <5,6,7,4>, <7,4,0,1>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 2852967014U, // <6,4,4,3>: Cost 3 vuzpr <5,6,7,4>, <6,4,1,3>
+ 2131132416U, // <6,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0
+ 2121850881U, // <6,4,4,5>: Cost 2 ins <6,u,4,5>, lane 1
+ 1718537526U, // <6,4,4,6>: Cost 2 vuzpl <6,7,4,5>, RHS
+ 2852967054U, // <6,4,4,7>: Cost 3 vuzpr <5,6,7,4>, <6,4,5,7>
+ 1718537544U, // <6,4,4,u>: Cost 2 vuzpl <6,7,4,5>, RHS
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 2242579800U, // <6,4,5,1>: Cost 3 vrev <4,6,1,5>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2242727274U, // <6,4,5,3>: Cost 3 vrev <4,6,3,5>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2131214336U, // <6,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1779223862U, // <6,4,5,7>: Cost 2 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1169067380U, // <6,4,6,4>: Cost 2 vrev <4,6,4,6>
+ 1852247350U, // <6,4,6,5>: Cost 2 vzipl <6,6,6,6>, RHS
+ 1986465078U, // <6,4,6,6>: Cost 2 vtrnl <6,6,6,6>, RHS
+ 2131304448U, // <6,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 2980495398U, // <6,4,7,2>: Cost 3 vzipr RHS, <0,0,4,2>
+ 2122055681U, // <6,4,7,3>: Cost 2 ins <6,u,7,3>, lane 1
+ 1906756816U, // <6,4,7,4>: Cost 2 vzipr RHS, <4,4,4,4>
+ 1906755278U, // <6,4,7,5>: Cost 2 vzipr RHS, <2,3,4,5>
+ 1906753740U, // <6,4,7,6>: Cost 2 vzipr RHS, <0,2,4,6>
+ 2122088449U, // <6,4,7,7>: Cost 2 ins <6,u,7,7>, lane 1
+ 1906753742U, // <6,4,7,u>: Cost 2 vzipr RHS, <0,2,4,u>
+ 1168788818U, // <6,4,u,0>: Cost 2 vrev <4,6,0,u>
+ 2121523201U, // <6,4,u,1>: Cost 2 ins <6,u,0,1>, lane 1
+ 1718540078U, // <6,4,u,2>: Cost 2 vuzpl <6,7,4,5>, LHS
+ 1779221149U, // <6,4,u,3>: Cost 2 vuzpr <5,6,7,4>, LHS
+ 1906765008U, // <6,4,u,4>: Cost 2 vzipr RHS, <4,4,4,4>
+ 1906763470U, // <6,4,u,5>: Cost 2 vzipr RHS, <2,3,4,5>
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1779224105U, // <6,4,u,7>: Cost 2 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3195256833U, // <6,5,0,0>: Cost 3 ins <6,u,0,0>, lane 1
+ 2121523201U, // <6,5,0,1>: Cost 2 ins <6,u,0,1>, lane 1
+ 2787721318U, // <6,5,0,2>: Cost 3 vuzpl <6,0,5,7>, LHS
+ 3195281409U, // <6,5,0,3>: Cost 3 ins <6,u,0,3>, lane 1
+ 2790367436U, // <6,5,0,4>: Cost 3 vuzpl <6,4,5,6>, <0,2,4,6>
+ 3121369092U, // <6,5,0,5>: Cost 3 vtrnr <5,6,7,0>, <5,5,5,5>
+ 2980440578U, // <6,5,0,6>: Cost 3 vzipr <4,5,6,0>, <3,4,5,6>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 2047626551U, // <6,5,0,u>: Cost 2 vtrnr <5,6,7,0>, RHS
+ 2578382950U, // <6,5,1,0>: Cost 3 vext1 <4,6,5,1>, LHS
+ 3205292032U, // <6,5,1,1>: Cost 3 ins <u,5,1,1>, lane 0
+ 3195346945U, // <6,5,1,2>: Cost 3 ins <6,u,1,2>, lane 1
+ 2834833510U, // <6,5,1,3>: Cost 3 vuzpr <2,6,4,5>, LHS
+ 2578386296U, // <6,5,1,4>: Cost 3 vext1 <4,6,5,1>, <4,6,5,1>
+ 2578387072U, // <6,5,1,5>: Cost 3 vext1 <4,6,5,1>, <5,7,1,3>
+ 2922205282U, // <6,5,1,6>: Cost 3 vzipl <6,1,0,3>, <5,6,7,0>
+ 2131599360U, // <6,5,1,7>: Cost 2 ins <u,5,1,7>, lane 0
+ 2131599360U, // <6,5,1,u>: Cost 2 ins <u,5,1,7>, lane 0
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 2982448018U, // <6,5,2,1>: Cost 3 vzipr <4,u,6,2>, <4,0,5,1>
+ 3195420673U, // <6,5,2,2>: Cost 3 ins <6,u,2,2>, lane 1
+ 2131640320U, // <6,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0
+ 2578394489U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, <4,6,5,2>
+ 3114897412U, // <6,5,2,5>: Cost 3 vtrnr <4,6,0,2>, <5,5,5,5>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 2041154870U, // <6,5,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS
+ 2041154871U, // <6,5,2,u>: Cost 2 vtrnr <4,6,0,2>, RHS
+ 3195478017U, // <6,5,3,0>: Cost 3 ins <6,u,3,0>, lane 1
+ 3205439488U, // <6,5,3,1>: Cost 3 ins <u,5,3,1>, lane 0
+ 3091164465U, // <6,5,3,2>: Cost 3 vtrnr <0,6,2,3>, <4,5,6,2>
+ 3195502593U, // <6,5,3,3>: Cost 3 ins <6,u,3,3>, lane 1
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3205472256U, // <6,5,3,5>: Cost 3 ins <u,5,3,5>, lane 0
+ 2980465154U, // <6,5,3,6>: Cost 3 vzipr <4,5,6,3>, <3,4,5,6>
+ 2131746816U, // <6,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0
+ 2131746816U, // <6,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0
+ 2789051724U, // <6,5,4,0>: Cost 3 vuzpl <6,2,5,7>, <4,6,0,2>
+ 3060715648U, // <6,5,4,1>: Cost 3 vtrnl <6,7,4,5>, <5,7,1,3>
+ 3195568129U, // <6,5,4,2>: Cost 3 ins <6,u,4,2>, lane 1
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2791705972U, // <6,5,4,4>: Cost 3 vuzpl <6,6,5,7>, <4,6,4,6>
+ 2121850881U, // <6,5,4,5>: Cost 2 ins <6,u,4,5>, lane 1
+ 2834833756U, // <6,5,4,6>: Cost 3 vuzpr <2,6,4,5>, <0,4,2,6>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3006363382U, // <6,5,5,1>: Cost 3 vzipr <u,u,6,5>, <u,0,5,1>
+ 3205595136U, // <6,5,5,2>: Cost 3 ins <u,5,5,2>, lane 0
+ 2980479105U, // <6,5,5,3>: Cost 3 vzipr <4,5,6,5>, <0,1,5,3>
+ 2578419068U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, <4,6,5,5>
+ 2131877888U, // <6,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0
+ 2979154434U, // <6,5,5,6>: Cost 3 vzipr <4,3,6,5>, <3,4,5,6>
+ 2131894272U, // <6,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0
+ 2131877888U, // <6,5,5,u>: Cost 2 ins <u,5,5,5>, lane 0
+ 2131910656U, // <6,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <6,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <6,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <6,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <6,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <6,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <6,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <6,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <6,5,6,u>: Cost 1 ins RHS, lane 0
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 1906756498U, // <6,5,7,1>: Cost 2 vzipr RHS, <4,0,5,1>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 2122055681U, // <6,5,7,3>: Cost 2 ins <6,u,7,3>, lane 1
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 1906756826U, // <6,5,7,5>: Cost 2 vzipr RHS, <4,4,5,5>
+ 1906756098U, // <6,5,7,6>: Cost 2 vzipr RHS, <3,4,5,6>
+ 2029473078U, // <6,5,7,7>: Cost 2 vtrnr <2,6,3,7>, RHS
+ 2029473079U, // <6,5,7,u>: Cost 2 vtrnr <2,6,3,7>, RHS
+ 2131910656U, // <6,5,u,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 1906764690U, // <6,5,u,1>: Cost 2 vzipr RHS, <4,0,5,1>
+ 2131927040U, // <6,5,u,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2122055681U, // <6,5,u,3>: Cost 2 ins <6,u,7,3>, lane 1
+ 2131943424U, // <6,5,u,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 1906765018U, // <6,5,u,5>: Cost 2 vzipr RHS, <4,4,5,5>
+ 1906764290U, // <6,5,u,6>: Cost 2 vzipr RHS, <3,4,5,6>
+ 1058226176U, // <6,5,u,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <6,5,u,u>: Cost 1 ins RHS, lane 0
+ 2047627362U, // <6,6,0,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0>
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 1718026342U, // <6,6,0,2>: Cost 2 vuzpl <6,6,6,6>, LHS
+ 3195281409U, // <6,6,0,3>: Cost 3 ins <6,u,0,3>, lane 1
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3195297793U, // <6,6,0,5>: Cost 3 ins <6,u,0,5>, lane 1
+ 2120826882U, // <6,6,0,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 2120835074U, // <6,6,0,7>: Cost 2 ins <6,6,u,7>, lane 2
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 1906707760U, // <6,6,1,1>: Cost 2 vzipr <4,5,6,1>, <4,5,6,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 1773043814U, // <6,6,1,3>: Cost 2 vuzpr <4,6,4,6>, LHS
+ 3194068995U, // <6,6,1,4>: Cost 3 ins <6,6,1,u>, lane 3
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2120826882U, // <6,6,1,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 2120835074U, // <6,6,1,7>: Cost 2 ins <6,6,u,7>, lane 2
+ 1773043819U, // <6,6,1,u>: Cost 2 vuzpr <4,6,4,6>, LHS
+ 3114896750U, // <6,6,2,0>: Cost 3 vtrnr <4,6,0,2>, <4,6,4,0>
+ 3195412481U, // <6,6,2,1>: Cost 3 ins <6,u,2,1>, lane 1
+ 2041154892U, // <6,6,2,2>: Cost 2 vtrnr <4,6,0,2>, <4,6,0,2>
+ 2120843269U, // <6,6,2,3>: Cost 2 ins <6,6,u,u>, lane 5
+ 3114897510U, // <6,6,2,4>: Cost 3 vtrnr <4,6,0,2>, <5,6,7,4>
+ 3195445249U, // <6,6,2,5>: Cost 3 ins <6,u,2,5>, lane 1
+ 2120826882U, // <6,6,2,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 1908706614U, // <6,6,2,7>: Cost 2 vzipr <4,u,6,2>, RHS
+ 1908706615U, // <6,6,2,u>: Cost 2 vzipr <4,u,6,2>, RHS
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 2846787238U, // <6,6,3,1>: Cost 3 vuzpr <4,6,4,6>, <2,3,0,1>
+ 3206111232U, // <6,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0
+ 1880178826U, // <6,6,3,3>: Cost 2 vzipr <0,1,6,3>, <0,1,6,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 2846787278U, // <6,6,3,5>: Cost 3 vuzpr <4,6,4,6>, <2,3,4,5>
+ 2120826882U, // <6,6,3,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 2132410368U, // <6,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0
+ 2132410368U, // <6,6,3,u>: Cost 2 ins <u,6,3,7>, lane 0
+ 2846790288U, // <6,6,4,0>: Cost 3 vuzpr <4,6,4,6>, <6,4,6,0>
+ 3194527746U, // <6,6,4,1>: Cost 3 ins <6,6,u,1>, lane 2
+ 2846788778U, // <6,6,4,2>: Cost 3 vuzpr <4,6,4,6>, <4,4,0,2>
+ 3195576321U, // <6,6,4,3>: Cost 3 ins <6,u,4,3>, lane 1
+ 2047660134U, // <6,6,4,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 1718029622U, // <6,6,4,6>: Cost 2 vuzpl <6,6,6,6>, RHS
+ 2120835074U, // <6,6,4,7>: Cost 2 ins <6,6,u,7>, lane 2
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3194363907U, // <6,6,5,0>: Cost 3 ins <6,6,5,u>, lane 3
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3206258688U, // <6,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0
+ 3194544130U, // <6,6,5,3>: Cost 3 ins <6,6,u,3>, lane 2
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 1906740532U, // <6,6,5,5>: Cost 2 vzipr <4,5,6,5>, <4,5,6,5>
+ 2120826882U, // <6,6,5,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 1773047094U, // <6,6,5,7>: Cost 2 vuzpr <4,6,4,6>, RHS
+ 1773047095U, // <6,6,5,u>: Cost 2 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2120695811U, // <6,6,6,1>: Cost 2 ins <6,6,6,u>, lane 3
+ 2120695811U, // <6,6,6,2>: Cost 2 ins <6,6,6,u>, lane 3
+ 2120695811U, // <6,6,6,3>: Cost 2 ins <6,6,6,u>, lane 3
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2120695811U, // <6,6,6,5>: Cost 2 ins <6,6,6,u>, lane 3
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 1908739382U, // <6,6,6,7>: Cost 2 vzipr <4,u,6,6>, RHS
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2132647936U, // <6,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0
+ 2120769539U, // <6,6,7,1>: Cost 2 ins <6,6,7,u>, lane 3
+ 1908747164U, // <6,6,7,2>: Cost 2 vzipr RHS, <4,0,6,2>
+ 2122055681U, // <6,6,7,3>: Cost 2 ins <6,u,7,3>, lane 1
+ 2132680704U, // <6,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0
+ 2120769539U, // <6,6,7,5>: Cost 2 ins <6,6,7,u>, lane 3
+ 1906758456U, // <6,6,7,6>: Cost 2 vzipr RHS, <6,6,6,6>
+ 833015094U, // <6,6,7,7>: Cost 1 vzipr RHS, RHS
+ 833015095U, // <6,6,7,u>: Cost 1 vzipr RHS, RHS
+ 2047627362U, // <6,6,u,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0>
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 1906764700U, // <6,6,u,2>: Cost 2 vzipr RHS, <4,0,6,2>
+ 1773044381U, // <6,6,u,3>: Cost 2 vuzpr <4,6,4,6>, LHS
+ 2047660134U, // <6,6,u,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4>
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 833023286U, // <6,6,u,7>: Cost 1 vzipr RHS, RHS
+ 833023287U, // <6,6,u,u>: Cost 1 vzipr RHS, RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2120916995U, // <6,7,0,3>: Cost 2 ins <6,7,0,u>, lane 3
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2120916995U, // <6,7,0,6>: Cost 2 ins <6,7,0,u>, lane 3
+ 2120916995U, // <6,7,0,7>: Cost 2 ins <6,7,0,u>, lane 3
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1761034342U, // <6,7,1,3>: Cost 2 vuzpr <2,6,3,7>, LHS
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2121498626U, // <6,7,1,7>: Cost 2 ins <6,7,u,7>, lane 2
+ 1761034347U, // <6,7,1,u>: Cost 2 vuzpr <2,6,3,7>, LHS
+ 2121064451U, // <6,7,2,0>: Cost 2 ins <6,7,2,u>, lane 3
+ 2121449474U, // <6,7,2,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1059889156U, // <6,7,2,3>: Cost 1 ins LHS, lane 4
+ 2121064451U, // <6,7,2,4>: Cost 2 ins <6,7,2,u>, lane 3
+ 2121482242U, // <6,7,2,5>: Cost 2 ins <6,7,u,5>, lane 2
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2121498626U, // <6,7,2,7>: Cost 2 ins <6,7,u,7>, lane 2
+ 1059889156U, // <6,7,2,u>: Cost 1 ins LHS, lane 4
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2121449474U, // <6,7,3,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 2133696516U, // <6,7,3,2>: Cost 2 ins <u,u,3,2>, lane 4
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2121482242U, // <6,7,3,5>: Cost 2 ins <6,7,u,5>, lane 2
+ 2834777789U, // <6,7,3,6>: Cost 3 vuzpr <2,6,3,7>, <2,3,2,6>
+ 2133737476U, // <6,7,3,7>: Cost 2 ins <u,u,3,7>, lane 4
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2121449474U, // <6,7,4,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 2121211907U, // <6,7,4,2>: Cost 2 ins <6,7,4,u>, lane 3
+ 2121211907U, // <6,7,4,3>: Cost 2 ins <6,7,4,u>, lane 3
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203276U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,0,2>
+ 2121211907U, // <6,7,4,7>: Cost 2 ins <6,7,4,u>, lane 3
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2121465858U, // <6,7,5,3>: Cost 2 ins <6,7,u,3>, lane 2
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1761037622U, // <6,7,5,7>: Cost 2 vuzpr <2,6,3,7>, RHS
+ 1761037623U, // <6,7,5,u>: Cost 2 vuzpr <2,6,3,7>, RHS
+ 2121359363U, // <6,7,6,0>: Cost 2 ins <6,7,6,u>, lane 3
+ 2121449474U, // <6,7,6,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2121465858U, // <6,7,6,3>: Cost 2 ins <6,7,u,3>, lane 2
+ 2121359363U, // <6,7,6,4>: Cost 2 ins <6,7,6,u>, lane 3
+ 2121482242U, // <6,7,6,5>: Cost 2 ins <6,7,u,5>, lane 2
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1060216836U, // <6,7,6,7>: Cost 1 ins RHS, lane 4
+ 1060216836U, // <6,7,6,u>: Cost 1 ins RHS, lane 4
+ 1906757730U, // <6,7,7,0>: Cost 2 vzipr RHS, <5,6,7,0>
+ 2121449474U, // <6,7,7,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 1906758138U, // <6,7,7,3>: Cost 2 vzipr RHS, <6,2,7,3>
+ 1906757734U, // <6,7,7,4>: Cost 2 vzipr RHS, <5,6,7,4>
+ 2121482242U, // <6,7,7,5>: Cost 2 ins <6,7,u,5>, lane 2
+ 1906757574U, // <6,7,7,6>: Cost 2 vzipr RHS, <5,4,7,6>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1906757738U, // <6,7,7,u>: Cost 2 vzipr RHS, <5,6,7,u>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1059889156U, // <6,7,u,3>: Cost 1 ins LHS, lane 4
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1060216836U, // <6,7,u,7>: Cost 1 ins RHS, lane 4
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2047623837U, // <6,u,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 1986648218U, // <6,u,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS
+ 2047626793U, // <6,u,0,7>: Cost 2 vtrnr <5,6,7,0>, RHS
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1761042534U, // <6,u,1,3>: Cost 2 vuzpr <2,6,3,u>, LHS
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2120826882U, // <6,u,1,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 2120835074U, // <6,u,1,7>: Cost 2 ins <6,6,u,7>, lane 2
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 1849644846U, // <6,u,2,1>: Cost 2 vzipl <6,2,7,3>, LHS
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1055244288U, // <6,u,2,3>: Cost 1 ins LHS, lane 0
+ 1504873876U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, <4,6,u,2>
+ 1849645210U, // <6,u,2,5>: Cost 2 vzipl <6,2,7,3>, RHS
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2041155113U, // <6,u,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS
+ 1055244288U, // <6,u,2,u>: Cost 1 ins LHS, lane 0
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2121449474U, // <6,u,3,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 2128388096U, // <6,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2121482242U, // <6,u,3,5>: Cost 2 ins <6,7,u,5>, lane 2
+ 2120826882U, // <6,u,3,6>: Cost 2 ins <6,6,u,6>, lane 2
+ 2131746816U, // <6,u,3,7>: Cost 2 ins <u,5,3,7>, lane 0
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2121449474U, // <6,u,4,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 1986975534U, // <6,u,4,2>: Cost 2 vtrnl <6,7,4,5>, LHS
+ 2047656605U, // <6,u,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220812U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,0,2>
+ 2047659561U, // <6,u,4,7>: Cost 2 vtrnr <5,6,7,4>, RHS
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2118148098U, // <6,u,5,3>: Cost 2 ins <6,2,u,3>, lane 2
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1761045814U, // <6,u,5,7>: Cost 2 vuzpr <2,6,3,u>, RHS
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 1852249902U, // <6,u,6,1>: Cost 2 vzipl <6,6,6,6>, LHS
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2041479837U, // <6,u,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS
+ 1504906648U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, <4,6,u,6>
+ 1852250266U, // <6,u,6,5>: Cost 2 vzipl <6,6,6,6>, RHS
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1058226176U, // <6,u,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <6,u,6,u>: Cost 1 ins RHS, lane 0
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1906753609U, // <6,u,7,1>: Cost 2 vzipr RHS, <0,0,u,1>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 833011868U, // <6,u,7,3>: Cost 1 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 1906753937U, // <6,u,7,5>: Cost 2 vzipr RHS, <0,4,u,5>
+ 1906753776U, // <6,u,7,6>: Cost 2 vzipr RHS, <0,2,u,6>
+ 833015112U, // <6,u,7,7>: Cost 1 vzipr RHS, RHS
+ 833011873U, // <6,u,7,u>: Cost 1 vzipr RHS, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 833020060U, // <6,u,u,3>: Cost 1 vzipr RHS, LHS
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 833023304U, // <6,u,u,7>: Cost 1 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 2987152532U, // <7,0,0,3>: Cost 3 vzipr <5,6,7,0>, <7,2,0,3>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2987152210U, // <7,0,0,5>: Cost 3 vzipr <5,6,7,0>, <6,7,0,5>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 2987152050U, // <7,0,0,7>: Cost 3 vzipr <5,6,7,0>, <6,5,0,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2128232448U, // <7,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2122317827U, // <7,0,1,3>: Cost 2 ins <7,0,1,u>, lane 3
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2122317827U, // <7,0,1,5>: Cost 2 ins <7,0,1,u>, lane 3
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2122317827U, // <7,0,1,7>: Cost 2 ins <7,0,1,u>, lane 3
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2128314368U, // <7,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 2122833925U, // <7,0,2,3>: Cost 2 ins <7,0,u,u>, lane 5
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2712060126U, // <7,0,2,6>: Cost 3 vext3 RHS, <0,2,6,6>
+ 3201433601U, // <7,0,2,7>: Cost 3 ins <7,u,2,7>, lane 1
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2983854080U, // <7,0,3,0>: Cost 3 vzipr <5,1,7,3>, <0,0,0,0>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2128388096U, // <7,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 3196559362U, // <7,0,3,6>: Cost 3 ins <7,0,u,6>, lane 2
+ 3201507329U, // <7,0,3,7>: Cost 3 ins <7,u,3,7>, lane 1
+ 2128388096U, // <7,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0
+ 2712060230U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,2>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3201548289U, // <7,0,4,3>: Cost 3 ins <7,u,4,3>, lane 1
+ 2712060269U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,5>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606348U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,0,2>
+ 3201581057U, // <7,0,4,7>: Cost 3 ins <7,u,4,7>, lane 1
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2647625340U, // <7,0,5,0>: Cost 3 vext2 <5,0,7,0>, <5,0,7,0>
+ 2128527360U, // <7,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0
+ 1991032934U, // <7,0,5,2>: Cost 2 vtrnl <7,4,5,6>, LHS
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2847477046U, // <7,0,5,7>: Cost 3 vuzpr <4,7,5,0>, RHS
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2985869312U, // <7,0,6,0>: Cost 3 vzipr <5,4,7,6>, <0,0,0,0>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2128609280U, // <7,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3202367488U, // <7,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2122833925U, // <7,0,6,7>: Cost 2 ins <7,0,u,u>, lane 5
+ 2128609280U, // <7,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0
+ 2847477192U, // <7,0,7,0>: Cost 3 vuzpr <4,7,5,0>, <4,7,5,0>
+ 1858961510U, // <7,0,7,1>: Cost 2 vzipl <7,7,7,7>, LHS
+ 1993179238U, // <7,0,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS
+ 3201769473U, // <7,0,7,3>: Cost 3 ins <7,u,7,3>, lane 1
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2128060417U, // <7,0,7,7>: Cost 2 ins <7,u,7,7>, lane 1
+ 1858962077U, // <7,0,7,u>: Cost 2 vzipl <7,7,7,7>, LHS
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2122317827U, // <7,0,u,3>: Cost 2 ins <7,0,1,u>, lane 3
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2122317827U, // <7,0,u,7>: Cost 2 ins <7,0,1,u>, lane 3
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2712060634U, // <7,1,0,0>: Cost 3 vext3 RHS, <1,0,0,1>
+ 2128822272U, // <7,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0
+ 1719615590U, // <7,1,0,2>: Cost 2 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2859062268U, // <7,1,0,4>: Cost 3 vuzpr <6,7,0,1>, <7,0,1,4>
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2859061568U, // <7,1,0,6>: Cost 3 vuzpr <6,7,0,1>, <6,0,4,6>
+ 3201286145U, // <7,1,0,7>: Cost 3 ins <7,u,0,7>, lane 1
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060714U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,0>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 2127577089U, // <7,1,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 2859057294U, // <7,1,1,7>: Cost 3 vuzpr <6,7,0,1>, <0,1,6,7>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2128961536U, // <7,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0
+ 2128969728U, // <7,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128977920U, // <7,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0
+ 1055244288U, // <7,1,2,3>: Cost 1 ins LHS, lane 0
+ 2128994304U, // <7,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0
+ 2129002496U, // <7,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2129010688U, // <7,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0
+ 2129018880U, // <7,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0
+ 1055244288U, // <7,1,2,u>: Cost 1 ins LHS, lane 0
+ 1510998118U, // <7,1,3,0>: Cost 2 vext1 <5,7,1,3>, LHS
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2047869030U, // <7,1,3,3>: Cost 2 vtrnr <5,7,1,3>, LHS
+ 1511001398U, // <7,1,3,4>: Cost 2 vext1 <5,7,1,3>, RHS
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2983859604U, // <7,1,3,7>: Cost 3 vzipr <5,1,7,3>, <7,5,1,7>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2129133568U, // <7,1,4,3>: Cost 2 ins <u,1,4,3>, lane 0
+ 2859060432U, // <7,1,4,4>: Cost 3 vuzpr <6,7,0,1>, <4,4,4,4>
+ 2129149952U, // <7,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0
+ 1719618870U, // <7,1,4,6>: Cost 2 vuzpl <7,0,1,2>, RHS
+ 2793360778U, // <7,1,4,7>: Cost 3 vuzpl <7,0,1,2>, <4,6,7,1>
+ 1719618888U, // <7,1,4,u>: Cost 2 vuzpl <7,0,1,2>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3202940928U, // <7,1,5,2>: Cost 3 ins <u,1,5,2>, lane 0
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 2985861458U, // <7,1,5,5>: Cost 3 vzipr <5,4,7,5>, <0,4,1,5>
+ 2127904769U, // <7,1,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 1785318710U, // <7,1,5,7>: Cost 2 vuzpr <6,7,0,1>, RHS
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 2653606230U, // <7,1,6,0>: Cost 3 vext2 <6,0,7,1>, <6,0,7,1>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2129281024U, // <7,1,6,3>: Cost 2 ins <u,1,6,3>, lane 0
+ 2859061350U, // <7,1,6,4>: Cost 3 vuzpr <6,7,0,1>, <5,6,7,4>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 2859060596U, // <7,1,6,6>: Cost 3 vuzpr <6,7,0,1>, <4,6,4,6>
+ 2129313792U, // <7,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0
+ 2129281024U, // <7,1,6,u>: Cost 2 ins <u,1,6,3>, lane 0
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 1785320270U, // <7,1,7,1>: Cost 2 vuzpr <6,7,0,1>, <6,7,0,1>
+ 2986543254U, // <7,1,7,2>: Cost 3 vzipr <5,5,7,7>, <3,0,1,2>
+ 2048196710U, // <7,1,7,3>: Cost 2 vtrnr <5,7,5,7>, LHS
+ 2793362538U, // <7,1,7,4>: Cost 3 vuzpl <7,0,1,2>, <7,1,4,6>
+ 2986541394U, // <7,1,7,5>: Cost 3 vzipr <5,5,7,7>, <0,4,1,5>
+ 3201794049U, // <7,1,7,6>: Cost 3 ins <7,u,7,6>, lane 1
+ 2128060417U, // <7,1,7,7>: Cost 2 ins <7,u,7,7>, lane 1
+ 2048196715U, // <7,1,7,u>: Cost 2 vtrnr <5,7,5,7>, LHS
+ 1511039078U, // <7,1,u,0>: Cost 2 vext1 <5,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 1719621422U, // <7,1,u,2>: Cost 2 vuzpl <7,0,1,2>, LHS
+ 1055244288U, // <7,1,u,3>: Cost 1 ins LHS, lane 0
+ 1511042358U, // <7,1,u,4>: Cost 2 vext1 <5,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 1719621786U, // <7,1,u,6>: Cost 2 vuzpl <7,0,1,2>, RHS
+ 1785318953U, // <7,1,u,7>: Cost 2 vuzpr <6,7,0,1>, RHS
+ 1055244288U, // <7,1,u,u>: Cost 1 ins LHS, lane 0
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2129494016U, // <7,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 1913405542U, // <7,2,0,3>: Cost 2 vzipr <5,6,7,0>, LHS
+ 2712061400U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,2>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 2927577066U, // <7,2,0,7>: Cost 3 vzipl <7,0,1,2>, <2,7,0,1>
+ 1913405547U, // <7,2,0,u>: Cost 2 vzipr <5,6,7,0>, LHS
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3203301376U, // <7,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0
+ 2127577089U, // <7,2,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 2974548070U, // <7,2,1,3>: Cost 3 vzipr <3,5,7,1>, LHS
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3203334144U, // <7,2,1,5>: Cost 3 ins <u,2,1,5>, lane 0
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2127577089U, // <7,2,1,u>: Cost 2 ins <7,u,1,2>, lane 1
+ 2712061524U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,0>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061564U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,4>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061581U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,3>
+ 3201433601U, // <7,2,2,7>: Cost 3 ins <7,u,2,7>, lane 1
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 1638319802U, // <7,2,3,2>: Cost 2 vext3 RHS, <2,3,2,3>
+ 1910112358U, // <7,2,3,3>: Cost 2 vzipr <5,1,7,3>, LHS
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 1625048802U, // <7,2,3,6>: Cost 2 vext3 <2,3,6,7>, <2,3,6,7>
+ 2990495214U, // <7,2,3,7>: Cost 3 vzipr <6,2,7,3>, <7,6,2,7>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061688U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,2>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 1913438310U, // <7,2,4,3>: Cost 2 vzipr <5,6,7,4>, LHS
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2129821696U, // <7,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 3201581057U, // <7,2,4,7>: Cost 3 ins <7,u,4,7>, lane 1
+ 1913438315U, // <7,2,4,u>: Cost 2 vzipr <5,6,7,4>, LHS
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3203596288U, // <7,2,5,1>: Cost 3 ins <u,2,5,1>, lane 0
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3203629056U, // <7,2,5,5>: Cost 3 ins <u,2,5,5>, lane 0
+ 2127904769U, // <7,2,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 2853096758U, // <7,2,5,7>: Cost 3 vuzpr <5,7,0,2>, RHS
+ 2127904769U, // <7,2,5,u>: Cost 2 ins <7,u,5,6>, lane 1
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 2129977344U, // <7,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 3121939350U, // <7,2,7,0>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,0>
+ 3203743744U, // <7,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0
+ 1720366165U, // <7,2,7,2>: Cost 2 vuzpl <7,1,2,3>, <7,1,2,3>
+ 1912799334U, // <7,2,7,3>: Cost 2 vzipr <5,5,7,7>, LHS
+ 3121939354U, // <7,2,7,4>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,4>
+ 3203776512U, // <7,2,7,5>: Cost 3 ins <u,2,7,5>, lane 0
+ 2986541404U, // <7,2,7,6>: Cost 3 vzipr <5,5,7,7>, <0,4,2,6>
+ 2128060417U, // <7,2,7,7>: Cost 2 ins <7,u,7,7>, lane 1
+ 1912799339U, // <7,2,7,u>: Cost 2 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 2129494016U, // <7,2,u,2>: Cost 2 ins <u,2,0,2>, lane 0
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2129821696U, // <7,2,u,6>: Cost 2 ins <u,2,4,6>, lane 0
+ 2129977344U, // <7,2,u,7>: Cost 2 ins <u,2,6,7>, lane 0
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2712062119U, // <7,3,0,3>: Cost 3 vext3 RHS, <3,0,3,1>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 2985157776U, // <7,3,0,7>: Cost 3 vzipr <5,3,7,0>, <1,5,3,7>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2127577089U, // <7,3,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 1779433574U, // <7,3,1,3>: Cost 2 vuzpr <5,7,1,3>, LHS
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 2853179064U, // <7,3,1,6>: Cost 3 vuzpr <5,7,1,3>, <5,1,4,6>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 1779433579U, // <7,3,1,u>: Cost 2 vuzpr <5,7,1,3>, LHS
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2130313216U, // <7,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0
+ 2712062292U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,3>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2130313216U, // <7,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0
+ 2712062334U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,0>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2990491658U, // <7,3,3,6>: Cost 3 vzipr <6,2,7,3>, <2,7,3,6>
+ 2972574864U, // <7,3,3,7>: Cost 3 vzipr <3,2,7,3>, <1,5,3,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2987180790U, // <7,3,4,2>: Cost 3 vzipr <5,6,7,4>, <1,0,3,2>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062455U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,4>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313164U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,0,2>
+ 2985190544U, // <7,3,4,7>: Cost 3 vzipr <5,3,7,4>, <1,5,3,7>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2712062498U, // <7,3,5,0>: Cost 3 vext3 RHS, <3,5,0,2>
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2127904769U, // <7,3,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 1779436854U, // <7,3,5,7>: Cost 2 vuzpr <5,7,1,3>, RHS
+ 1779436855U, // <7,3,5,u>: Cost 2 vuzpr <5,7,1,3>, RHS
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2853178744U, // <7,3,6,1>: Cost 3 vuzpr <5,7,1,3>, <4,6,5,1>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3204366336U, // <7,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2130640896U, // <7,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 1779437696U, // <7,3,7,3>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2237582070U, // <7,3,7,5>: Cost 3 vrev <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2128060417U, // <7,3,7,7>: Cost 2 ins <7,u,7,7>, lane 1
+ 1779437696U, // <7,3,7,u>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1779434141U, // <7,3,u,3>: Cost 2 vuzpr <5,7,1,3>, LHS
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2127904769U, // <7,3,u,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 1779437097U, // <7,3,u,7>: Cost 2 vuzpr <5,7,1,3>, RHS
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2714053478U, // <7,4,0,0>: Cost 3 vext3 RHS, <4,0,0,2>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3201253377U, // <7,4,0,3>: Cost 3 ins <7,u,0,3>, lane 1
+ 2714053512U, // <7,4,0,4>: Cost 3 vext3 RHS, <4,0,4,0>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 2927578568U, // <7,4,0,7>: Cost 3 vzipl <7,0,1,2>, <4,7,5,0>
+ 1640311726U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,2>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2127577089U, // <7,4,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 3127495888U, // <7,4,1,4>: Cost 3 vtrnr <6,7,0,1>, <4,4,4,4>
+ 2130919424U, // <7,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0
+ 1988054326U, // <7,4,1,6>: Cost 2 vtrnl <7,0,1,2>, RHS
+ 3061796234U, // <7,4,1,7>: Cost 3 vtrnl <7,0,1,2>, <4,6,7,1>
+ 1988054344U, // <7,4,1,u>: Cost 2 vtrnl <7,0,1,2>, RHS
+ 3204694016U, // <7,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0
+ 3199172610U, // <7,4,2,1>: Cost 3 ins <7,4,u,1>, lane 2
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2125488133U, // <7,4,2,3>: Cost 2 ins <7,4,u,u>, lane 5
+ 2853258138U, // <7,4,2,4>: Cost 3 vuzpr <5,7,2,4>, <1,2,3,4>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2131001344U, // <7,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 3201433601U, // <7,4,2,7>: Cost 3 ins <7,u,2,7>, lane 1
+ 2125488133U, // <7,4,2,u>: Cost 2 ins <7,4,u,u>, lane 5
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3201458177U, // <7,4,3,1>: Cost 3 ins <7,u,3,1>, lane 1
+ 3204784128U, // <7,4,3,2>: Cost 3 ins <u,4,3,2>, lane 0
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2983857360U, // <7,4,3,4>: Cost 3 vzipr <5,1,7,3>, <4,4,4,4>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2125471746U, // <7,4,3,6>: Cost 2 ins <7,4,u,6>, lane 2
+ 3201507329U, // <7,4,3,7>: Cost 3 ins <7,u,3,7>, lane 1
+ 2125471746U, // <7,4,3,u>: Cost 2 ins <7,4,u,6>, lane 2
+ 2714053800U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,0>
+ 3201531905U, // <7,4,4,1>: Cost 3 ins <7,u,4,1>, lane 1
+ 3201540097U, // <7,4,4,2>: Cost 3 ins <7,u,4,2>, lane 1
+ 2987185336U, // <7,4,4,3>: Cost 3 vzipr <5,6,7,4>, <7,2,4,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2987185664U, // <7,4,4,7>: Cost 3 vzipr <5,6,7,4>, <7,6,4,7>
+ 1640312054U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,6>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2125266947U, // <7,4,5,1>: Cost 2 ins <7,4,5,u>, lane 3
+ 2125266947U, // <7,4,5,2>: Cost 2 ins <7,4,5,u>, lane 3
+ 2125266947U, // <7,4,5,3>: Cost 2 ins <7,4,5,u>, lane 3
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2131214336U, // <7,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2125266947U, // <7,4,5,7>: Cost 2 ins <7,4,5,u>, lane 3
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 1638468940U, // <7,4,6,0>: Cost 2 vext3 RHS, <4,6,0,2>
+ 2712063318U, // <7,4,6,1>: Cost 3 vext3 RHS, <4,6,1,3>
+ 2712210780U, // <7,4,6,2>: Cost 3 vext3 RHS, <4,6,2,0>
+ 2712210790U, // <7,4,6,3>: Cost 3 vext3 RHS, <4,6,3,1>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2131296256U, // <7,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0
+ 2125488133U, // <7,4,6,7>: Cost 2 ins <7,4,u,u>, lane 5
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 2794279930U, // <7,4,7,1>: Cost 3 vuzpl <7,1,4,6>, <7,0,1,2>
+ 3201761281U, // <7,4,7,2>: Cost 3 ins <7,u,7,2>, lane 1
+ 3201769473U, // <7,4,7,3>: Cost 3 ins <7,u,7,3>, lane 1
+ 2847509964U, // <7,4,7,4>: Cost 3 vuzpr <4,7,5,4>, <4,7,5,4>
+ 1858964790U, // <7,4,7,5>: Cost 2 vzipl <7,7,7,7>, RHS
+ 1993182518U, // <7,4,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS
+ 2128060417U, // <7,4,7,7>: Cost 2 ins <7,u,7,7>, lane 1
+ 1858965033U, // <7,4,7,u>: Cost 2 vzipl <7,7,7,7>, RHS
+ 1640312302U, // <7,4,u,0>: Cost 2 vext3 RHS, <4,u,0,2>
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2127577089U, // <7,4,u,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 2125488133U, // <7,4,u,3>: Cost 2 ins <7,4,u,u>, lane 5
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2125266947U, // <7,4,u,7>: Cost 2 ins <7,4,5,u>, lane 3
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2131476480U, // <7,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0
+ 1722597478U, // <7,5,0,2>: Cost 2 vuzpl <7,4,5,6>, LHS
+ 3201253377U, // <7,5,0,3>: Cost 3 ins <7,u,0,3>, lane 1
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2987150554U, // <7,5,0,5>: Cost 3 vzipr <5,6,7,0>, <4,4,5,5>
+ 2987149826U, // <7,5,0,6>: Cost 3 vzipr <5,6,7,0>, <3,4,5,6>
+ 2131525632U, // <7,5,0,7>: Cost 2 ins <u,5,0,7>, lane 0
+ 1722597532U, // <7,5,0,u>: Cost 2 vuzpl <7,4,5,6>, LHS
+ 2714054287U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 2249183358U, // <7,5,1,1>: Cost 3 vrev <5,7,1,1>
+ 2127577089U, // <7,5,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 1785643110U, // <7,5,1,3>: Cost 2 vuzpr <6,7,4,5>, LHS
+ 2714054327U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 3127496708U, // <7,5,1,5>: Cost 3 vtrnr <6,7,0,1>, <5,5,5,5>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2249117814U, // <7,5,2,0>: Cost 3 vrev <5,7,0,2>
+ 2714054379U, // <7,5,2,1>: Cost 3 vext3 RHS, <5,2,1,3>
+ 2249265288U, // <7,5,2,2>: Cost 3 vrev <5,7,2,2>
+ 2131640320U, // <7,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0
+ 2859385754U, // <7,5,2,4>: Cost 3 vuzpr <6,7,4,5>, <1,2,3,4>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2712063768U, // <7,5,2,6>: Cost 3 vext3 RHS, <5,2,6,3>
+ 2131673088U, // <7,5,2,7>: Cost 2 ins <u,5,2,7>, lane 0
+ 2131640320U, // <7,5,2,u>: Cost 2 ins <u,5,2,3>, lane 0
+ 3201449985U, // <7,5,3,0>: Cost 3 ins <7,u,3,0>, lane 1
+ 1175457920U, // <7,5,3,1>: Cost 2 vrev <5,7,1,3>
+ 2249273481U, // <7,5,3,2>: Cost 3 vrev <5,7,2,3>
+ 2249347218U, // <7,5,3,3>: Cost 3 vrev <5,7,3,3>
+ 3201482753U, // <7,5,3,4>: Cost 3 ins <7,u,3,4>, lane 1
+ 2983857370U, // <7,5,3,5>: Cost 3 vzipr <5,1,7,3>, <4,4,5,5>
+ 2983856642U, // <7,5,3,6>: Cost 3 vzipr <5,1,7,3>, <3,4,5,6>
+ 2047872310U, // <7,5,3,7>: Cost 2 vtrnr <5,7,1,3>, RHS
+ 2047872311U, // <7,5,3,u>: Cost 2 vtrnr <5,7,1,3>, RHS
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 2987182994U, // <7,5,4,1>: Cost 3 vzipr <5,6,7,4>, <4,0,5,1>
+ 2249281674U, // <7,5,4,2>: Cost 3 vrev <5,7,2,4>
+ 3201548289U, // <7,5,4,3>: Cost 3 ins <7,u,4,3>, lane 1
+ 2579074508U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, <4,7,5,4>
+ 2131804160U, // <7,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0
+ 1722600758U, // <7,5,4,6>: Cost 2 vuzpl <7,4,5,6>, RHS
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2714054620U, // <7,5,5,1>: Cost 3 vext3 RHS, <5,5,1,1>
+ 3201613825U, // <7,5,5,2>: Cost 3 ins <7,u,5,2>, lane 1
+ 2649657204U, // <7,5,5,3>: Cost 3 vext2 <5,3,7,5>, <5,3,7,5>
+ 2714054651U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,5>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2127904769U, // <7,5,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2131910656U, // <7,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0
+ 2131918848U, // <7,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 2131927040U, // <7,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0
+ 2131935232U, // <7,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0
+ 2131943424U, // <7,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0
+ 2131951616U, // <7,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 2131959808U, // <7,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0
+ 1058226176U, // <7,5,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <7,5,6,u>: Cost 1 ins RHS, lane 0
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 1638469760U, // <7,5,7,1>: Cost 2 vext3 RHS, <5,7,1,3>
+ 2712211590U, // <7,5,7,2>: Cost 3 vext3 RHS, <5,7,2,0>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2048199990U, // <7,5,7,7>: Cost 2 vtrnr <5,7,5,7>, RHS
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 1638469841U, // <7,5,u,1>: Cost 2 vext3 RHS, <5,u,1,3>
+ 1722603310U, // <7,5,u,2>: Cost 2 vuzpl <7,4,5,6>, LHS
+ 1785643677U, // <7,5,u,3>: Cost 2 vuzpr <6,7,4,5>, LHS
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 1722603674U, // <7,5,u,6>: Cost 2 vuzpl <7,4,5,6>, RHS
+ 1058226176U, // <7,5,u,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <7,5,u,u>: Cost 1 ins RHS, lane 0
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2132148224U, // <7,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2987151292U, // <7,6,0,5>: Cost 3 vzipr <5,6,7,0>, <5,4,6,5>
+ 2987150564U, // <7,6,0,6>: Cost 3 vzipr <5,6,7,0>, <4,4,6,6>
+ 1913408822U, // <7,6,0,7>: Cost 2 vzipr <5,6,7,0>, RHS
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2127577089U, // <7,6,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 2841329766U, // <7,6,1,3>: Cost 3 vuzpr <3,7,2,6>, LHS
+ 2579123666U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, <4,7,6,1>
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2974551350U, // <7,6,1,7>: Cost 3 vzipr <3,5,7,1>, RHS
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2714055117U, // <7,6,2,2>: Cost 3 vext3 RHS, <6,2,2,3>
+ 2132303872U, // <7,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2714055152U, // <7,6,2,6>: Cost 3 vext3 RHS, <6,2,6,2>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 3121614200U, // <7,6,3,1>: Cost 3 vtrnr <5,7,1,3>, <4,6,5,1>
+ 1181504354U, // <7,6,3,2>: Cost 2 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 3206135808U, // <7,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0
+ 2983857380U, // <7,6,3,6>: Cost 3 vzipr <5,1,7,3>, <4,4,6,6>
+ 1910115638U, // <7,6,3,7>: Cost 2 vzipr <5,1,7,3>, RHS
+ 1910115639U, // <7,6,3,u>: Cost 2 vzipr <5,1,7,3>, RHS
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2714055276U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,0>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2650328272U, // <7,6,4,4>: Cost 3 vext2 <5,4,7,6>, <4,4,4,4>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2132475904U, // <7,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 1913441590U, // <7,6,4,7>: Cost 2 vzipr <5,6,7,4>, RHS
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3201622017U, // <7,6,5,3>: Cost 3 ins <7,u,5,3>, lane 1
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2127904769U, // <7,6,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 2971929910U, // <7,6,5,7>: Cost 3 vzipr <3,1,7,5>, RHS
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2712212245U, // <7,6,6,2>: Cost 3 vext3 RHS, <6,6,2,7>
+ 3201695745U, // <7,6,6,3>: Cost 3 ins <7,u,6,3>, lane 1
+ 2714055461U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,5>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 1638323042U, // <7,6,7,2>: Cost 2 vext3 RHS, <6,7,2,3>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 1638323082U, // <7,6,7,6>: Cost 2 vext3 RHS, <6,7,6,7>
+ 1912802614U, // <7,6,7,7>: Cost 2 vzipr <5,5,7,7>, RHS
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2132148224U, // <7,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0
+ 2132303872U, // <7,6,u,3>: Cost 2 ins <u,6,2,3>, lane 0
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2132475904U, // <7,6,u,6>: Cost 2 ins <u,6,4,6>, lane 0
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 1913409634U, // <7,7,0,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 1724743782U, // <7,7,0,2>: Cost 2 vuzpl <7,7,7,7>, LHS
+ 2987151056U, // <7,7,0,3>: Cost 3 vzipr <5,6,7,0>, <5,1,7,3>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2987151302U, // <7,7,0,6>: Cost 3 vzipr <5,6,7,0>, <5,4,7,6>
+ 2127470594U, // <7,7,0,7>: Cost 2 ins <7,7,u,7>, lane 2
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2053755726U, // <7,7,1,1>: Cost 2 vtrnr <6,7,0,1>, <6,7,0,1>
+ 2127577089U, // <7,7,1,2>: Cost 2 ins <7,u,1,2>, lane 1
+ 1779761254U, // <7,7,1,3>: Cost 2 vuzpr <5,7,5,7>, LHS
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2127470594U, // <7,7,1,7>: Cost 2 ins <7,7,u,7>, lane 2
+ 1779761259U, // <7,7,1,u>: Cost 2 vuzpr <5,7,5,7>, LHS
+ 2853503894U, // <7,7,2,0>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,0>
+ 3206692864U, // <7,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0
+ 1988801621U, // <7,7,2,2>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3>
+ 2132967424U, // <7,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0
+ 2853503898U, // <7,7,2,4>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,4>
+ 3206725632U, // <7,7,2,5>: Cost 3 ins <u,7,2,5>, lane 0
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2127470594U, // <7,7,2,7>: Cost 2 ins <7,7,u,7>, lane 2
+ 1988801621U, // <7,7,2,u>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3121615694U, // <7,7,3,1>: Cost 3 vtrnr <5,7,1,3>, <6,7,0,1>
+ 3201171458U, // <7,7,3,2>: Cost 3 ins <7,7,u,2>, lane 2
+ 1910116048U, // <7,7,3,3>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2639055462U, // <7,7,3,5>: Cost 3 vext2 <3,5,7,7>, <3,5,7,7>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2127470594U, // <7,7,3,7>: Cost 2 ins <7,7,u,7>, lane 2
+ 1910116048U, // <7,7,3,u>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3062715386U, // <7,7,4,1>: Cost 3 vtrnl <7,1,4,6>, <7,0,1,2>
+ 3201540097U, // <7,7,4,2>: Cost 3 ins <7,u,4,2>, lane 1
+ 2987183824U, // <7,7,4,3>: Cost 3 vzipr <5,6,7,4>, <5,1,7,3>
+ 1913442406U, // <7,7,4,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 1724747062U, // <7,7,4,6>: Cost 2 vuzpl <7,7,7,7>, RHS
+ 2127470594U, // <7,7,4,7>: Cost 2 ins <7,7,u,7>, lane 2
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2853508547U, // <7,7,5,0>: Cost 3 vuzpr <5,7,5,7>, <7,5,7,0>
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3201613825U, // <7,7,5,2>: Cost 3 ins <7,u,5,2>, lane 1
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2127904769U, // <7,7,5,6>: Cost 2 ins <7,u,5,6>, lane 1
+ 1779764534U, // <7,7,5,7>: Cost 2 vuzpr <5,7,5,7>, RHS
+ 1779764535U, // <7,7,5,u>: Cost 2 vuzpr <5,7,5,7>, RHS
+ 2985873506U, // <7,7,6,0>: Cost 3 vzipr <5,4,7,6>, <5,6,7,0>
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2985873104U, // <7,7,6,3>: Cost 3 vzipr <5,4,7,6>, <5,1,7,3>
+ 2985873510U, // <7,7,6,4>: Cost 3 vzipr <5,4,7,6>, <5,6,7,4>
+ 2985873511U, // <7,7,6,5>: Cost 3 vzipr <5,4,7,6>, <5,6,7,5>
+ 1912131526U, // <7,7,6,6>: Cost 2 vzipr <5,4,7,6>, <5,4,7,6>
+ 2133295104U, // <7,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2127405059U, // <7,7,7,1>: Cost 2 ins <7,7,7,u>, lane 3
+ 2127405059U, // <7,7,7,2>: Cost 2 ins <7,7,7,u>, lane 3
+ 2127405059U, // <7,7,7,3>: Cost 2 ins <7,7,7,u>, lane 3
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2127405059U, // <7,7,7,5>: Cost 2 ins <7,7,7,u>, lane 3
+ 2127405059U, // <7,7,7,6>: Cost 2 ins <7,7,7,u>, lane 3
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1913409634U, // <7,7,u,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0>
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 1724749614U, // <7,7,u,2>: Cost 2 vuzpl <7,7,7,7>, LHS
+ 1779761821U, // <7,7,u,3>: Cost 2 vuzpr <5,7,5,7>, LHS
+ 1913442406U, // <7,7,u,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4>
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1724749978U, // <7,7,u,6>: Cost 2 vuzpl <7,7,7,7>, RHS
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1720131686U, // <7,u,0,2>: Cost 2 vuzpl <7,0,u,2>, LHS
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1853839514U, // <7,u,0,5>: Cost 2 vzipl <7,0,1,2>, RHS
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1913408840U, // <7,u,0,7>: Cost 2 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 2128232448U, // <7,u,1,1>: Cost 2 ins <u,0,1,1>, lane 0
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2122317827U, // <7,u,1,5>: Cost 2 ins <7,0,1,u>, lane 3
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1662211948U, // <7,u,2,0>: Cost 2 vext3 RHS, <u,2,0,2>
+ 2128969728U, // <7,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0
+ 2128314368U, // <7,u,2,2>: Cost 2 ins <u,0,2,2>, lane 0
+ 1055244288U, // <7,u,2,3>: Cost 1 ins LHS, lane 0
+ 1662211988U, // <7,u,2,4>: Cost 2 vext3 RHS, <u,2,4,6>
+ 2129002496U, // <7,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0
+ 2131001344U, // <7,u,2,6>: Cost 2 ins <u,4,2,6>, lane 0
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1055244288U, // <7,u,2,u>: Cost 1 ins LHS, lane 0
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638324167U, // <7,u,3,1>: Cost 2 vext3 RHS, <u,3,1,3>
+ 2128388096U, // <7,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0
+ 1910112412U, // <7,u,3,3>: Cost 2 vzipr <5,1,7,3>, LHS
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2125471746U, // <7,u,3,6>: Cost 2 ins <7,4,u,6>, lane 2
+ 1910115656U, // <7,u,3,7>: Cost 2 vzipr <5,1,7,3>, RHS
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1856821038U, // <7,u,4,1>: Cost 2 vzipl <7,4,5,6>, LHS
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 1913438364U, // <7,u,4,3>: Cost 2 vzipr <5,6,7,4>, LHS
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1720134966U, // <7,u,4,6>: Cost 2 vuzpl <7,0,u,2>, RHS
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 1991038766U, // <7,u,5,2>: Cost 2 vtrnl <7,4,5,6>, LHS
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 1662359728U, // <7,u,6,0>: Cost 2 vext3 RHS, <u,6,0,2>
+ 2131918848U, // <7,u,6,1>: Cost 2 ins <u,5,6,1>, lane 0
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1662359768U, // <7,u,6,4>: Cost 2 vext3 RHS, <u,6,4,6>
+ 2131951616U, // <7,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1058226176U, // <7,u,6,7>: Cost 1 ins RHS, lane 0
+ 1058226176U, // <7,u,6,u>: Cost 1 ins RHS, lane 0
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 1640462603U, // <7,u,7,1>: Cost 2 vext3 RHS, <u,7,1,3>
+ 1993185070U, // <7,u,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS
+ 1912799388U, // <7,u,7,3>: Cost 2 vzipr <5,5,7,7>, LHS
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1640462643U, // <7,u,7,5>: Cost 2 vext3 RHS, <u,7,5,7>
+ 1993185434U, // <7,u,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1055244288U, // <7,u,u,3>: Cost 1 ins LHS, lane 0
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 1058226176U, // <7,u,u,7>: Cost 1 ins RHS, lane 0
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2085707777U, // <u,0,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2080440323U, // <u,0,0,5>: Cost 2 ins <0,0,0,u>, lane 3
+ 2080440323U, // <u,0,0,6>: Cost 2 ins <0,0,0,u>, lane 3
+ 2080440323U, // <u,0,0,7>: Cost 2 ins <0,0,0,u>, lane 3
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 786808934U, // <u,0,1,1>: Cost 1 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 1756332134U, // <u,0,1,3>: Cost 2 vuzpr <1,u,3,0>, LHS
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2085797889U, // <u,0,1,5>: Cost 2 ins <0,u,1,5>, lane 1
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2080514051U, // <u,0,1,7>: Cost 2 ins <0,0,1,u>, lane 3
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 1994768394U, // <u,0,2,1>: Cost 2 vtrnl LHS, <0,0,1,1>
+ 921026662U, // <u,0,2,2>: Cost 1 vtrnl LHS, LHS
+ 1012113409U, // <u,0,2,3>: Cost 1 ins LHS, lane 1
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2080587779U, // <u,0,2,5>: Cost 2 ins <0,0,2,u>, lane 3
+ 2085879809U, // <u,0,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2080587779U, // <u,0,2,7>: Cost 2 ins <0,0,2,u>, lane 3
+ 921026716U, // <u,0,2,u>: Cost 1 vtrnl LHS, LHS
+ 1880326144U, // <u,0,3,0>: Cost 2 vzipr LHS, <0,0,0,0>
+ 1880327846U, // <u,0,3,1>: Cost 2 vzipr LHS, <2,3,0,1>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2091900929U, // <u,0,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2091909121U, // <u,0,3,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 2086633475U, // <u,0,3,5>: Cost 2 ins <1,0,3,u>, lane 3
+ 2086633475U, // <u,0,3,6>: Cost 2 ins <1,0,3,u>, lane 3
+ 2091933697U, // <u,0,3,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 1705610572U, // <u,0,4,0>: Cost 2 vuzpl <4,6,0,2>, <4,6,0,2>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 2086002689U, // <u,0,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 1947828428U, // <u,0,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1726844214U, // <u,0,4,6>: Cost 2 vuzpl <u,2,0,2>, RHS
+ 2109923329U, // <u,0,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 1863532544U, // <u,0,5,0>: Cost 2 vzipl RHS, <0,0,0,0>
+ 789790822U, // <u,0,5,1>: Cost 1 vzipl RHS, LHS
+ 1996349542U, // <u,0,5,2>: Cost 2 vtrnl <u,3,5,7>, LHS
+ 2104696835U, // <u,0,5,3>: Cost 2 ins <4,0,5,u>, lane 3
+ 1863532882U, // <u,0,5,4>: Cost 2 vzipl RHS, <0,4,1,5>
+ 2109980673U, // <u,0,5,5>: Cost 2 ins <4,u,5,5>, lane 1
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 1756335414U, // <u,0,5,7>: Cost 2 vuzpr <1,u,3,0>, RHS
+ 789791389U, // <u,0,5,u>: Cost 1 vzipl RHS, LHS
+ 1997750272U, // <u,0,6,0>: Cost 2 vtrnl RHS, <0,0,0,0>
+ 1997750282U, // <u,0,6,1>: Cost 2 vtrnl RHS, <0,0,1,1>
+ 924008550U, // <u,0,6,2>: Cost 1 vtrnl RHS, LHS
+ 2104770563U, // <u,0,6,3>: Cost 2 ins <4,0,6,u>, lane 3
+ 1146503858U, // <u,0,6,4>: Cost 2 vrev <0,u,4,6>
+ 2104770563U, // <u,0,6,5>: Cost 2 ins <4,0,6,u>, lane 3
+ 2110062593U, // <u,0,6,6>: Cost 2 ins <4,u,6,6>, lane 1
+ 1036328961U, // <u,0,6,7>: Cost 1 ins RHS, lane 1
+ 924008604U, // <u,0,6,u>: Cost 1 vtrnl RHS, LHS
+ 1906900992U, // <u,0,7,0>: Cost 2 vzipr RHS, <0,0,0,0>
+ 1906902694U, // <u,0,7,1>: Cost 2 vzipr RHS, <2,3,0,1>
+ 1906901156U, // <u,0,7,2>: Cost 2 vzipr RHS, <0,2,0,2>
+ 2116083713U, // <u,0,7,3>: Cost 2 ins <5,u,7,3>, lane 1
+ 2116091905U, // <u,0,7,4>: Cost 2 ins <5,u,7,4>, lane 1
+ 2980643874U, // <u,0,7,5>: Cost 3 vzipr RHS, <1,4,0,5>
+ 2116108289U, // <u,0,7,6>: Cost 2 ins <5,u,7,6>, lane 1
+ 2116116481U, // <u,0,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 1906901162U, // <u,0,7,u>: Cost 2 vzipr RHS, <0,2,0,u>
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 791453798U, // <u,0,u,1>: Cost 1 vzipl LHS, LHS
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 1012113409U, // <u,0,u,3>: Cost 1 ins LHS, lane 1
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 1036328961U, // <u,0,u,7>: Cost 1 ins RHS, lane 1
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1818149622U, // <u,1,0,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2>
+ 1007951877U, // <u,1,0,1>: Cost 1 ins LHS, lane 5
+ 1725587558U, // <u,1,0,2>: Cost 2 vuzpl <u,0,1,2>, LHS
+ 1007910914U, // <u,1,0,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <u,1,0,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 2081669122U, // <u,1,0,5>: Cost 2 ins <0,1,u,5>, lane 2
+ 2081677314U, // <u,1,0,6>: Cost 2 ins <0,1,u,6>, lane 2
+ 2081685506U, // <u,1,0,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007951877U, // <u,1,0,u>: Cost 1 ins LHS, lane 5
+ 1481786002U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, <0,u,1,1>
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 1860551574U, // <u,1,1,2>: Cost 2 vzipl LHS, <1,2,3,0>
+ 1007910914U, // <u,1,1,3>: Cost 1 ins LHS, lane 2
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 1860551824U, // <u,1,1,5>: Cost 2 vzipl LHS, <1,5,3,7>
+ 2081677314U, // <u,1,1,6>: Cost 2 ins <0,1,u,6>, lane 2
+ 2081685506U, // <u,1,1,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007910914U, // <u,1,1,u>: Cost 1 ins LHS, lane 2
+ 1007509507U, // <u,1,2,0>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,2,1>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,2,2>: Cost 1 ins LHS, lane 3
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1007509507U, // <u,1,2,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,2,5>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,2,6>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,2,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 1880328342U, // <u,1,3,2>: Cost 2 vzipr LHS, <3,0,1,2>
+ 945004646U, // <u,1,3,3>: Cost 1 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2087297027U, // <u,1,3,6>: Cost 2 ins <1,1,3,u>, lane 3
+ 2133737476U, // <u,1,3,7>: Cost 2 ins <u,u,3,7>, lane 4
+ 945004651U, // <u,1,3,u>: Cost 1 vtrnr LHS, LHS
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2081636354U, // <u,1,4,1>: Cost 2 ins <0,1,u,1>, lane 2
+ 2081644546U, // <u,1,4,2>: Cost 2 ins <0,1,u,2>, lane 2
+ 1007910914U, // <u,1,4,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <u,1,4,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 1007951877U, // <u,1,4,5>: Cost 1 ins LHS, lane 5
+ 1725590838U, // <u,1,4,6>: Cost 2 vuzpl <u,0,1,2>, RHS
+ 2081685506U, // <u,1,4,7>: Cost 2 ins <0,1,u,7>, lane 2
+ 1007910914U, // <u,1,4,u>: Cost 1 ins LHS, lane 2
+ 1481818774U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, <0,u,1,5>
+ 1863533364U, // <u,1,5,1>: Cost 2 vzipl RHS, <1,1,1,1>
+ 1863533462U, // <u,1,5,2>: Cost 2 vzipl RHS, <1,2,3,0>
+ 1007910914U, // <u,1,5,3>: Cost 1 ins LHS, lane 2
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 1863533712U, // <u,1,5,5>: Cost 2 vzipl RHS, <1,5,3,7>
+ 2133876740U, // <u,1,5,6>: Cost 2 ins <u,u,5,6>, lane 4
+ 1750224182U, // <u,1,5,7>: Cost 2 vuzpr <0,u,1,1>, RHS
+ 1007910914U, // <u,1,5,u>: Cost 1 ins LHS, lane 2
+ 2081628162U, // <u,1,6,0>: Cost 2 ins <0,1,u,0>, lane 2
+ 1997751092U, // <u,1,6,1>: Cost 2 vtrnl RHS, <1,1,1,1>
+ 2133917700U, // <u,1,6,2>: Cost 2 ins <u,u,6,2>, lane 4
+ 1007910914U, // <u,1,6,3>: Cost 1 ins LHS, lane 2
+ 2081660930U, // <u,1,6,4>: Cost 2 ins <0,1,u,4>, lane 2
+ 1997751296U, // <u,1,6,5>: Cost 2 vtrnl RHS, <1,3,5,7>
+ 2133950468U, // <u,1,6,6>: Cost 2 ins <u,u,6,6>, lane 4
+ 1060216836U, // <u,1,6,7>: Cost 1 ins RHS, lane 4
+ 1007910914U, // <u,1,6,u>: Cost 1 ins LHS, lane 2
+ 2133975044U, // <u,1,7,0>: Cost 2 ins <u,u,7,0>, lane 4
+ 1906901002U, // <u,1,7,1>: Cost 2 vzipr RHS, <0,0,1,1>
+ 1906903190U, // <u,1,7,2>: Cost 2 vzipr RHS, <3,0,1,2>
+ 969220198U, // <u,1,7,3>: Cost 1 vtrnr RHS, LHS
+ 2134007812U, // <u,1,7,4>: Cost 2 ins <u,u,7,4>, lane 4
+ 1152558485U, // <u,1,7,5>: Cost 2 vrev <1,u,5,7>
+ 2134024196U, // <u,1,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 2134032388U, // <u,1,7,7>: Cost 2 ins <u,u,7,7>, lane 4
+ 969220203U, // <u,1,7,u>: Cost 1 vtrnr RHS, LHS
+ 1007509507U, // <u,1,u,0>: Cost 1 ins LHS, lane 3
+ 1007951877U, // <u,1,u,1>: Cost 1 ins LHS, lane 5
+ 1007509507U, // <u,1,u,2>: Cost 1 ins LHS, lane 3
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1007509507U, // <u,1,u,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,u,5>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,u,6>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,1,u,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1726332928U, // <u,2,0,0>: Cost 2 vuzpl LHS, <0,0,0,0>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 652591206U, // <u,2,0,2>: Cost 1 vuzpl LHS, LHS
+ 1886937190U, // <u,2,0,3>: Cost 2 vzipr <1,2,u,0>, LHS
+ 1726333132U, // <u,2,0,4>: Cost 2 vuzpl LHS, <0,2,4,6>
+ 2081767427U, // <u,2,0,5>: Cost 2 ins <0,2,0,u>, lane 3
+ 2082340866U, // <u,2,0,6>: Cost 2 ins <0,2,u,6>, lane 2
+ 2081767427U, // <u,2,0,7>: Cost 2 ins <0,2,0,u>, lane 3
+ 652591260U, // <u,2,0,u>: Cost 1 vuzpl LHS, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 1726333748U, // <u,2,1,1>: Cost 2 vuzpl LHS, <1,1,1,1>
+ 1860552296U, // <u,2,1,2>: Cost 2 vzipl LHS, <2,2,2,2>
+ 1750155366U, // <u,2,1,3>: Cost 2 vuzpr <0,u,0,2>, LHS
+ 2088296450U, // <u,2,1,4>: Cost 2 ins <1,2,u,4>, lane 2
+ 1726333952U, // <u,2,1,5>: Cost 2 vuzpl LHS, <1,3,5,7>
+ 1860552634U, // <u,2,1,6>: Cost 2 vzipl LHS, <2,6,3,7>
+ 2109702145U, // <u,2,1,7>: Cost 2 ins <4,u,1,7>, lane 1
+ 1750155371U, // <u,2,1,u>: Cost 2 vuzpr <0,u,0,2>, LHS
+ 1481867932U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, <0,u,2,2>
+ 2085838849U, // <u,2,2,1>: Cost 2 ins <0,u,2,1>, lane 1
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1012113409U, // <u,2,2,3>: Cost 1 ins LHS, lane 1
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2085871617U, // <u,2,2,5>: Cost 2 ins <0,u,2,5>, lane 1
+ 2085879809U, // <u,2,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 2085888001U, // <u,2,2,7>: Cost 2 ins <0,u,2,7>, lane 1
+ 1012113409U, // <u,2,2,u>: Cost 1 ins LHS, lane 1
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1880326164U, // <u,2,3,2>: Cost 2 vzipr LHS, <0,0,2,2>
+ 806584422U, // <u,2,3,3>: Cost 1 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1726335490U, // <u,2,3,5>: Cost 2 vuzpl LHS, <3,4,5,6>
+ 1880326492U, // <u,2,3,6>: Cost 2 vzipr LHS, <0,4,2,6>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 806584427U, // <u,2,3,u>: Cost 1 vzipr LHS, LHS
+ 1726336332U, // <u,2,4,0>: Cost 2 vuzpl LHS, <4,6,0,2>
+ 2082062339U, // <u,2,4,1>: Cost 2 ins <0,2,4,u>, lane 3
+ 2082308098U, // <u,2,4,2>: Cost 2 ins <0,2,u,2>, lane 2
+ 1886969958U, // <u,2,4,3>: Cost 2 vzipr <1,2,u,4>, LHS
+ 1726336208U, // <u,2,4,4>: Cost 2 vuzpl LHS, <4,4,4,4>
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 652594486U, // <u,2,4,6>: Cost 1 vuzpl LHS, RHS
+ 2082062339U, // <u,2,4,7>: Cost 2 ins <0,2,4,u>, lane 3
+ 652594504U, // <u,2,4,u>: Cost 1 vuzpl LHS, RHS
+ 2088263682U, // <u,2,5,0>: Cost 2 ins <1,2,u,0>, lane 2
+ 1726337152U, // <u,2,5,1>: Cost 2 vuzpl LHS, <5,7,1,3>
+ 1863534184U, // <u,2,5,2>: Cost 2 vzipl RHS, <2,2,2,2>
+ 1884987494U, // <u,2,5,3>: Cost 2 vzipr <0,u,u,5>, LHS
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 1726337028U, // <u,2,5,5>: Cost 2 vuzpl LHS, <5,5,5,5>
+ 1863534522U, // <u,2,5,6>: Cost 2 vzipl RHS, <2,6,3,7>
+ 1750158646U, // <u,2,5,7>: Cost 2 vuzpr <0,u,0,2>, RHS
+ 1750158647U, // <u,2,5,u>: Cost 2 vuzpr <0,u,0,2>, RHS
+ 1481900704U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, <0,u,2,6>
+ 2110021633U, // <u,2,6,1>: Cost 2 ins <4,u,6,1>, lane 1
+ 1997751912U, // <u,2,6,2>: Cost 2 vtrnl RHS, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2110054401U, // <u,2,6,5>: Cost 2 ins <4,u,6,5>, lane 1
+ 1726337848U, // <u,2,6,6>: Cost 2 vuzpl LHS, <6,6,6,6>
+ 1036328961U, // <u,2,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <u,2,6,u>: Cost 1 ins RHS, lane 1
+ 2042962838U, // <u,2,7,0>: Cost 2 vtrnr RHS, <1,2,3,0>
+ 1726338042U, // <u,2,7,1>: Cost 2 vuzpl LHS, <7,0,1,2>
+ 1906901012U, // <u,2,7,2>: Cost 2 vzipr RHS, <0,0,2,2>
+ 833159270U, // <u,2,7,3>: Cost 1 vzipr RHS, LHS
+ 2042962842U, // <u,2,7,4>: Cost 2 vtrnr RHS, <1,2,3,4>
+ 1726338406U, // <u,2,7,5>: Cost 2 vuzpl LHS, <7,4,5,6>
+ 1906901340U, // <u,2,7,6>: Cost 2 vzipr RHS, <0,4,2,6>
+ 1726338668U, // <u,2,7,7>: Cost 2 vuzpl LHS, <7,7,7,7>
+ 833159275U, // <u,2,7,u>: Cost 1 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 652597038U, // <u,2,u,2>: Cost 1 vuzpl LHS, LHS
+ 806625382U, // <u,2,u,3>: Cost 1 vzipr LHS, LHS
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 652597402U, // <u,2,u,6>: Cost 1 vuzpl LHS, RHS
+ 1036328961U, // <u,2,u,7>: Cost 1 ins RHS, lane 1
+ 806625387U, // <u,2,u,u>: Cost 1 vzipr LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2088951810U, // <u,3,0,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2094940162U, // <u,3,0,5>: Cost 2 ins <2,3,u,5>, lane 2
+ 2094374915U, // <u,3,0,6>: Cost 2 ins <2,3,0,u>, lane 3
+ 2088984578U, // <u,3,0,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 676569190U, // <u,3,1,3>: Cost 1 vuzpr LHS, LHS
+ 1860553218U, // <u,3,1,4>: Cost 2 vzipl LHS, <3,4,5,6>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2088476675U, // <u,3,1,6>: Cost 2 ins <1,3,1,u>, lane 3
+ 2088984578U, // <u,3,1,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 676569195U, // <u,3,1,u>: Cost 1 vuzpr LHS, LHS
+ 1750311830U, // <u,3,2,0>: Cost 2 vuzpr LHS, <1,2,3,0>
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1012113409U, // <u,3,2,3>: Cost 1 ins LHS, lane 1
+ 1750311834U, // <u,3,2,4>: Cost 2 vuzpr LHS, <1,2,3,4>
+ 1994770946U, // <u,3,2,5>: Cost 2 vtrnl LHS, <3,4,5,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2088984578U, // <u,3,2,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 1012113409U, // <u,3,2,u>: Cost 1 ins LHS, lane 1
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1750312614U, // <u,3,3,1>: Cost 2 vuzpr LHS, <2,3,0,1>
+ 1880326902U, // <u,3,3,2>: Cost 2 vzipr LHS, <1,0,3,2>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 1750312654U, // <u,3,3,5>: Cost 2 vuzpr LHS, <2,3,4,5>
+ 2100568067U, // <u,3,3,6>: Cost 2 ins <3,3,3,u>, lane 3
+ 1880327312U, // <u,3,3,7>: Cost 2 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2094669827U, // <u,3,4,2>: Cost 2 ins <2,3,4,u>, lane 3
+ 2088951810U, // <u,3,4,3>: Cost 2 ins <1,3,u,3>, lane 2
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1750311260U, // <u,3,4,6>: Cost 2 vuzpr LHS, <0,4,2,6>
+ 2088984578U, // <u,3,4,7>: Cost 2 ins <1,3,u,7>, lane 2
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 1863534742U, // <u,3,5,0>: Cost 2 vzipl RHS, <3,0,1,2>
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2088771587U, // <u,3,5,2>: Cost 2 ins <1,3,5,u>, lane 3
+ 1863535004U, // <u,3,5,3>: Cost 2 vzipl RHS, <3,3,3,3>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 676572470U, // <u,3,5,7>: Cost 1 vuzpr LHS, RHS
+ 676572471U, // <u,3,5,u>: Cost 1 vuzpr LHS, RHS
+ 1798090850U, // <u,3,6,0>: Cost 2 vuzpr LHS, <5,6,7,0>
+ 1997752470U, // <u,3,6,1>: Cost 2 vtrnl RHS, <3,0,1,2>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 1997752732U, // <u,3,6,3>: Cost 2 vtrnl RHS, <3,3,3,3>
+ 1798090854U, // <u,3,6,4>: Cost 2 vuzpr LHS, <5,6,7,4>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1060216836U, // <u,3,6,7>: Cost 1 ins RHS, lane 4
+ 1060216836U, // <u,3,6,u>: Cost 1 ins RHS, lane 4
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 1906901832U, // <u,3,7,3>: Cost 2 vzipr RHS, <1,1,3,3>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2042963662U, // <u,3,7,5>: Cost 2 vtrnr RHS, <2,3,4,5>
+ 2134024196U, // <u,3,7,6>: Cost 2 ins <u,u,7,6>, lane 4
+ 1906902160U, // <u,3,7,7>: Cost 2 vzipr RHS, <1,5,3,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1880367862U, // <u,3,u,2>: Cost 2 vzipr LHS, <1,0,3,2>
+ 676569757U, // <u,3,u,3>: Cost 1 vuzpr LHS, LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1750311584U, // <u,3,u,6>: Cost 2 vuzpr LHS, <0,u,2,6>
+ 676572713U, // <u,3,u,7>: Cost 1 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 1974046028U, // <u,4,0,0>: Cost 2 vtrnl <4,6,0,2>, <4,6,0,2>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1727168614U, // <u,4,0,2>: Cost 2 vuzpl <u,2,4,6>, LHS
+ 2085707777U, // <u,4,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 1679392972U, // <u,4,0,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 2109628417U, // <u,4,0,7>: Cost 2 ins <4,u,0,7>, lane 1
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1860553618U, // <u,4,1,0>: Cost 2 vzipl LHS, <4,0,5,1>
+ 2085765121U, // <u,4,1,1>: Cost 2 ins <0,u,1,1>, lane 1
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 1756364902U, // <u,4,1,3>: Cost 2 vuzpr <1,u,3,4>, LHS
+ 1860553936U, // <u,4,1,4>: Cost 2 vzipl LHS, <4,4,4,4>
+ 786812214U, // <u,4,1,5>: Cost 1 vzipl LHS, RHS
+ 1994026294U, // <u,4,1,6>: Cost 2 vtrnl <u,0,1,2>, RHS
+ 2083168259U, // <u,4,1,7>: Cost 2 ins <0,4,1,u>, lane 3
+ 786812457U, // <u,4,1,u>: Cost 1 vzipl LHS, RHS
+ 1170066926U, // <u,4,2,0>: Cost 2 vrev <4,u,0,2>
+ 2083241987U, // <u,4,2,1>: Cost 2 ins <0,4,2,u>, lane 3
+ 2085847041U, // <u,4,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <u,4,2,3>: Cost 1 ins LHS, lane 1
+ 1994771664U, // <u,4,2,4>: Cost 2 vtrnl LHS, <4,4,4,4>
+ 1994771346U, // <u,4,2,5>: Cost 2 vtrnl LHS, <4,0,5,1>
+ 921029942U, // <u,4,2,6>: Cost 1 vtrnl LHS, RHS
+ 2083241987U, // <u,4,2,7>: Cost 2 ins <0,4,2,u>, lane 3
+ 921029960U, // <u,4,2,u>: Cost 1 vtrnl LHS, RHS
+ 2091876353U, // <u,4,3,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 2954070192U, // <u,4,3,1>: Cost 3 vzipr LHS, <3,0,4,1>
+ 2091892737U, // <u,4,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2091900929U, // <u,4,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 1928105168U, // <u,4,3,4>: Cost 2 vzipr LHS, <4,4,4,4>
+ 1880327886U, // <u,4,3,5>: Cost 2 vzipr LHS, <2,3,4,5>
+ 1880326348U, // <u,4,3,6>: Cost 2 vzipr LHS, <0,2,4,6>
+ 2091933697U, // <u,4,3,7>: Cost 2 ins <1,u,3,7>, lane 1
+ 1880326350U, // <u,4,3,u>: Cost 2 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2107277315U, // <u,4,4,1>: Cost 2 ins <4,4,4,u>, lane 3
+ 2107277315U, // <u,4,4,2>: Cost 2 ins <4,4,4,u>, lane 3
+ 2086002689U, // <u,4,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2109923329U, // <u,4,4,7>: Cost 2 ins <4,u,4,7>, lane 1
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2101379075U, // <u,4,5,1>: Cost 2 ins <3,4,5,u>, lane 3
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2101379075U, // <u,4,5,3>: Cost 2 ins <3,4,5,u>, lane 3
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 789794102U, // <u,4,5,5>: Cost 1 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 1756368182U, // <u,4,5,7>: Cost 2 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482048178U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, <0,u,4,6>
+ 2107424771U, // <u,4,6,1>: Cost 2 ins <4,4,6,u>, lane 3
+ 2110029825U, // <u,4,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2107424771U, // <u,4,6,3>: Cost 2 ins <4,4,6,u>, lane 3
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 1997753234U, // <u,4,6,5>: Cost 2 vtrnl RHS, <4,0,5,1>
+ 924011830U, // <u,4,6,6>: Cost 1 vtrnl RHS, RHS
+ 1036328961U, // <u,4,6,7>: Cost 1 ins RHS, lane 1
+ 924011848U, // <u,4,6,u>: Cost 1 vtrnl RHS, RHS
+ 2116059137U, // <u,4,7,0>: Cost 2 ins <5,u,7,0>, lane 1
+ 2113470467U, // <u,4,7,1>: Cost 2 ins <5,4,7,u>, lane 3
+ 2113470467U, // <u,4,7,2>: Cost 2 ins <5,4,7,u>, lane 3
+ 2116083713U, // <u,4,7,3>: Cost 2 ins <5,u,7,3>, lane 1
+ 1906904272U, // <u,4,7,4>: Cost 2 vzipr RHS, <4,4,4,4>
+ 1906902734U, // <u,4,7,5>: Cost 2 vzipr RHS, <2,3,4,5>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2116116481U, // <u,4,7,7>: Cost 2 ins <5,u,7,7>, lane 1
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482064564U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, <0,u,4,u>
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 1012113409U, // <u,4,u,3>: Cost 1 ins LHS, lane 1
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 791457078U, // <u,4,u,5>: Cost 1 vzipl LHS, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 1036328961U, // <u,4,u,7>: Cost 1 ins RHS, lane 1
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2085683201U, // <u,5,0,0>: Cost 2 ins <0,u,0,0>, lane 1
+ 1034493957U, // <u,5,0,1>: Cost 1 ins RHS, lane 5
+ 1727914086U, // <u,5,0,2>: Cost 2 vuzpl <u,3,5,7>, LHS
+ 2085707777U, // <u,5,0,3>: Cost 2 ins <0,u,0,3>, lane 1
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 1678778497U, // <u,5,0,5>: Cost 2 vuzpl <0,1,5,3>, <0,1,5,3>
+ 2108219394U, // <u,5,0,6>: Cost 2 ins <4,5,u,6>, lane 2
+ 1034485762U, // <u,5,0,7>: Cost 1 ins RHS, lane 2
+ 1034493957U, // <u,5,0,u>: Cost 1 ins RHS, lane 5
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 1860554448U, // <u,5,1,1>: Cost 2 vzipl LHS, <5,1,7,3>
+ 2103689217U, // <u,5,1,2>: Cost 2 ins <3,u,1,2>, lane 1
+ 1750253670U, // <u,5,1,3>: Cost 2 vuzpr <0,u,1,5>, LHS
+ 1505971738U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, <4,u,5,1>
+ 1860554756U, // <u,5,1,5>: Cost 2 vzipl LHS, <5,5,5,5>
+ 1860554850U, // <u,5,1,6>: Cost 2 vzipl LHS, <5,6,7,0>
+ 1034485762U, // <u,5,1,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <u,5,1,u>: Cost 1 ins RHS, lane 2
+ 2085830657U, // <u,5,2,0>: Cost 2 ins <0,u,2,0>, lane 1
+ 1994772608U, // <u,5,2,1>: Cost 2 vtrnl LHS, <5,7,1,3>
+ 2085847041U, // <u,5,2,2>: Cost 2 ins <0,u,2,2>, lane 1
+ 1012113409U, // <u,5,2,3>: Cost 1 ins LHS, lane 1
+ 2085863425U, // <u,5,2,4>: Cost 2 ins <0,u,2,4>, lane 1
+ 1994772484U, // <u,5,2,5>: Cost 2 vtrnl LHS, <5,5,5,5>
+ 2085879809U, // <u,5,2,6>: Cost 2 ins <0,u,2,6>, lane 1
+ 1034485762U, // <u,5,2,7>: Cost 1 ins RHS, lane 2
+ 1012113409U, // <u,5,2,u>: Cost 1 ins LHS, lane 1
+ 2091876353U, // <u,5,3,0>: Cost 2 ins <1,u,3,0>, lane 1
+ 1176121553U, // <u,5,3,1>: Cost 2 vrev <5,u,1,3>
+ 2091892737U, // <u,5,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 2091900929U, // <u,5,3,3>: Cost 2 ins <1,u,3,3>, lane 1
+ 2091909121U, // <u,5,3,4>: Cost 2 ins <1,u,3,4>, lane 1
+ 1928105178U, // <u,5,3,5>: Cost 2 vzipr LHS, <4,4,5,5>
+ 1880328706U, // <u,5,3,6>: Cost 2 vzipr LHS, <3,4,5,6>
+ 945007926U, // <u,5,3,7>: Cost 1 vtrnr LHS, RHS
+ 945007927U, // <u,5,3,u>: Cost 1 vtrnr LHS, RHS
+ 2108170242U, // <u,5,4,0>: Cost 2 ins <4,5,u,0>, lane 2
+ 2108178434U, // <u,5,4,1>: Cost 2 ins <4,5,u,1>, lane 2
+ 2108186626U, // <u,5,4,2>: Cost 2 ins <4,5,u,2>, lane 2
+ 2086002689U, // <u,5,4,3>: Cost 2 ins <0,u,4,3>, lane 1
+ 1845022662U, // <u,5,4,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6>
+ 1034493957U, // <u,5,4,5>: Cost 1 ins RHS, lane 5
+ 1727917366U, // <u,5,4,6>: Cost 2 vuzpl <u,3,5,7>, RHS
+ 1034485762U, // <u,5,4,7>: Cost 1 ins RHS, lane 2
+ 1034493957U, // <u,5,4,u>: Cost 1 ins RHS, lane 5
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 1863536336U, // <u,5,5,1>: Cost 2 vzipl RHS, <5,1,7,3>
+ 2108186626U, // <u,5,5,2>: Cost 2 ins <4,5,u,2>, lane 2
+ 2086076417U, // <u,5,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 1506004510U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, <4,u,5,5>
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 1863536738U, // <u,5,5,6>: Cost 2 vzipl RHS, <5,6,7,0>
+ 1034485762U, // <u,5,5,7>: Cost 1 ins RHS, lane 2
+ 1034485762U, // <u,5,5,u>: Cost 1 ins RHS, lane 2
+ 1034346499U, // <u,5,6,0>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,1>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,2>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,3>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,4>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,5>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,5,6,6>: Cost 1 ins RHS, lane 3
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2114134019U, // <u,5,7,2>: Cost 2 ins <5,5,7,u>, lane 3
+ 2133999620U, // <u,5,7,3>: Cost 2 ins <u,u,7,3>, lane 4
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 1906903554U, // <u,5,7,6>: Cost 2 vzipr RHS, <3,4,5,6>
+ 969223478U, // <u,5,7,7>: Cost 1 vtrnr RHS, RHS
+ 969223479U, // <u,5,7,u>: Cost 1 vtrnr RHS, RHS
+ 1034346499U, // <u,5,u,0>: Cost 1 ins RHS, lane 3
+ 1034493957U, // <u,5,u,1>: Cost 1 ins RHS, lane 5
+ 1034346499U, // <u,5,u,2>: Cost 1 ins RHS, lane 3
+ 1012113409U, // <u,5,u,3>: Cost 1 ins LHS, lane 1
+ 1034346499U, // <u,5,u,4>: Cost 1 ins RHS, lane 3
+ 1034493957U, // <u,5,u,5>: Cost 1 ins RHS, lane 5
+ 1034346499U, // <u,5,u,6>: Cost 1 ins RHS, lane 3
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 1729314816U, // <u,6,0,0>: Cost 2 vuzpl RHS, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 655573094U, // <u,6,0,2>: Cost 1 vuzpl RHS, LHS
+ 2108309507U, // <u,6,0,3>: Cost 2 ins <4,6,0,u>, lane 3
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2108309507U, // <u,6,0,5>: Cost 2 ins <4,6,0,u>, lane 3
+ 2108882946U, // <u,6,0,6>: Cost 2 ins <4,6,u,6>, lane 2
+ 1886940470U, // <u,6,0,7>: Cost 2 vzipr <1,2,u,0>, RHS
+ 655573148U, // <u,6,0,u>: Cost 1 vuzpl RHS, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 1729315636U, // <u,6,1,1>: Cost 2 vuzpl RHS, <1,1,1,1>
+ 1860555258U, // <u,6,1,2>: Cost 2 vzipl LHS, <6,2,7,3>
+ 1750335590U, // <u,6,1,3>: Cost 2 vuzpr <0,u,2,6>, LHS
+ 2114838530U, // <u,6,1,4>: Cost 2 ins <5,6,u,4>, lane 2
+ 1729315840U, // <u,6,1,5>: Cost 2 vuzpl RHS, <1,3,5,7>
+ 1860555576U, // <u,6,1,6>: Cost 2 vzipl LHS, <6,6,6,6>
+ 1884958006U, // <u,6,1,7>: Cost 2 vzipr <0,u,u,1>, RHS
+ 1750335595U, // <u,6,1,u>: Cost 2 vuzpr <0,u,2,6>, LHS
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2085838849U, // <u,6,2,1>: Cost 2 ins <0,u,2,1>, lane 1
+ 1729316456U, // <u,6,2,2>: Cost 2 vuzpl RHS, <2,2,2,2>
+ 1012113409U, // <u,6,2,3>: Cost 1 ins LHS, lane 1
+ 1506053668U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, <4,u,6,2>
+ 2085871617U, // <u,6,2,5>: Cost 2 ins <0,u,2,5>, lane 1
+ 1994773304U, // <u,6,2,6>: Cost 2 vtrnl LHS, <6,6,6,6>
+ 1880984886U, // <u,6,2,7>: Cost 2 vzipr <0,2,u,2>, RHS
+ 1012113409U, // <u,6,2,u>: Cost 1 ins LHS, lane 1
+ 2066526306U, // <u,6,3,0>: Cost 2 vtrnr LHS, <5,6,7,0>
+ 1729317014U, // <u,6,3,1>: Cost 2 vuzpl RHS, <3,0,1,2>
+ 1928104860U, // <u,6,3,2>: Cost 2 vzipr LHS, <4,0,6,2>
+ 1729317276U, // <u,6,3,3>: Cost 2 vuzpl RHS, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 1729317378U, // <u,6,3,5>: Cost 2 vuzpl RHS, <3,4,5,6>
+ 1928105188U, // <u,6,3,6>: Cost 2 vzipr LHS, <4,4,6,6>
+ 806587702U, // <u,6,3,7>: Cost 1 vzipr LHS, RHS
+ 806587703U, // <u,6,3,u>: Cost 1 vzipr LHS, RHS
+ 1729318220U, // <u,6,4,0>: Cost 2 vuzpl RHS, <4,6,0,2>
+ 2108604419U, // <u,6,4,1>: Cost 2 ins <4,6,4,u>, lane 3
+ 2108850178U, // <u,6,4,2>: Cost 2 ins <4,6,u,2>, lane 2
+ 2108604419U, // <u,6,4,3>: Cost 2 ins <4,6,4,u>, lane 3
+ 1729318096U, // <u,6,4,4>: Cost 2 vuzpl RHS, <4,4,4,4>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 655576374U, // <u,6,4,6>: Cost 1 vuzpl RHS, RHS
+ 1886973238U, // <u,6,4,7>: Cost 2 vzipr <1,2,u,4>, RHS
+ 655576392U, // <u,6,4,u>: Cost 1 vuzpl RHS, RHS
+ 2114805762U, // <u,6,5,0>: Cost 2 ins <5,6,u,0>, lane 2
+ 1729319040U, // <u,6,5,1>: Cost 2 vuzpl RHS, <5,7,1,3>
+ 1863537146U, // <u,6,5,2>: Cost 2 vzipl RHS, <6,2,7,3>
+ 2086076417U, // <u,6,5,3>: Cost 2 ins <0,u,5,3>, lane 1
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 1729318916U, // <u,6,5,5>: Cost 2 vuzpl RHS, <5,5,5,5>
+ 1863537464U, // <u,6,5,6>: Cost 2 vzipl RHS, <6,6,6,6>
+ 1750338870U, // <u,6,5,7>: Cost 2 vuzpr <0,u,2,6>, RHS
+ 1750338871U, // <u,6,5,u>: Cost 2 vuzpr <0,u,2,6>, RHS
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2110021633U, // <u,6,6,1>: Cost 2 ins <4,u,6,1>, lane 1
+ 2110029825U, // <u,6,6,2>: Cost 2 ins <4,u,6,2>, lane 1
+ 2086150145U, // <u,6,6,3>: Cost 2 ins <0,u,6,3>, lane 1
+ 1506086440U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, <4,u,6,6>
+ 2110054401U, // <u,6,6,5>: Cost 2 ins <4,u,6,5>, lane 1
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1036328961U, // <u,6,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <u,6,6,u>: Cost 1 ins RHS, lane 1
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1906903964U, // <u,6,7,2>: Cost 2 vzipr RHS, <4,0,6,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1906904292U, // <u,6,7,6>: Cost 2 vzipr RHS, <4,4,6,6>
+ 833162550U, // <u,6,7,7>: Cost 1 vzipr RHS, RHS
+ 833162551U, // <u,6,7,u>: Cost 1 vzipr RHS, RHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 655578926U, // <u,6,u,2>: Cost 1 vuzpl RHS, LHS
+ 1012113409U, // <u,6,u,3>: Cost 1 ins LHS, lane 1
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 655579290U, // <u,6,u,6>: Cost 1 vuzpl RHS, RHS
+ 806628662U, // <u,6,u,7>: Cost 1 vzipr LHS, RHS
+ 806628663U, // <u,6,u,u>: Cost 1 vzipr LHS, RHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2115493890U, // <u,7,0,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2120916995U, // <u,7,0,6>: Cost 2 ins <6,7,0,u>, lane 3
+ 2115526658U, // <u,7,0,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 700784742U, // <u,7,1,3>: Cost 1 vuzpr RHS, LHS
+ 1860556134U, // <u,7,1,4>: Cost 2 vzipl LHS, <7,4,5,6>
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2115018755U, // <u,7,1,6>: Cost 2 ins <5,7,1,u>, lane 3
+ 1860556396U, // <u,7,1,7>: Cost 2 vzipl LHS, <7,7,7,7>
+ 700784747U, // <u,7,1,u>: Cost 1 vuzpr RHS, LHS
+ 1774527382U, // <u,7,2,0>: Cost 2 vuzpr RHS, <1,2,3,0>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1012113409U, // <u,7,2,3>: Cost 1 ins LHS, lane 1
+ 1774527386U, // <u,7,2,4>: Cost 2 vuzpr RHS, <1,2,3,4>
+ 1994773862U, // <u,7,2,5>: Cost 2 vtrnl LHS, <7,4,5,6>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 1994774124U, // <u,7,2,7>: Cost 2 vtrnl LHS, <7,7,7,7>
+ 1012113409U, // <u,7,2,u>: Cost 1 ins LHS, lane 1
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 1774528166U, // <u,7,3,1>: Cost 2 vuzpr RHS, <2,3,0,1>
+ 2091892737U, // <u,7,3,2>: Cost 2 ins <1,u,3,2>, lane 1
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1774528206U, // <u,7,3,5>: Cost 2 vuzpr RHS, <2,3,4,5>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 1774527488U, // <u,7,3,7>: Cost 2 vuzpr RHS, <1,3,5,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2121449474U, // <u,7,4,1>: Cost 2 ins <6,7,u,1>, lane 2
+ 2121211907U, // <u,7,4,2>: Cost 2 ins <6,7,4,u>, lane 3
+ 2115493890U, // <u,7,4,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360076U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,0,2>
+ 2115526658U, // <u,7,4,7>: Cost 2 ins <5,7,u,7>, lane 2
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 1863537658U, // <u,7,5,0>: Cost 2 vzipl RHS, <7,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2115313667U, // <u,7,5,2>: Cost 2 ins <5,7,5,u>, lane 3
+ 2115493890U, // <u,7,5,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 700788022U, // <u,7,5,7>: Cost 1 vuzpr RHS, RHS
+ 700788023U, // <u,7,5,u>: Cost 1 vuzpr RHS, RHS
+ 1774530658U, // <u,7,6,0>: Cost 2 vuzpr RHS, <5,6,7,0>
+ 1997755386U, // <u,7,6,1>: Cost 2 vtrnl RHS, <7,0,1,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2115493890U, // <u,7,6,3>: Cost 2 ins <5,7,u,3>, lane 2
+ 1774530662U, // <u,7,6,4>: Cost 2 vuzpr RHS, <5,6,7,4>
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1036328961U, // <u,7,6,7>: Cost 1 ins RHS, lane 1
+ 1036328961U, // <u,7,6,u>: Cost 1 ins RHS, lane 1
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 1774531406U, // <u,7,7,1>: Cost 2 vuzpr RHS, <6,7,0,1>
+ 2127405059U, // <u,7,7,2>: Cost 2 ins <7,7,7,u>, lane 3
+ 1906904784U, // <u,7,7,3>: Cost 2 vzipr RHS, <5,1,7,3>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1774531446U, // <u,7,7,5>: Cost 2 vuzpr RHS, <6,7,4,5>
+ 1906905030U, // <u,7,7,6>: Cost 2 vzipr RHS, <5,4,7,6>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 700785309U, // <u,7,u,3>: Cost 1 vuzpr RHS, LHS
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 700788265U, // <u,7,u,7>: Cost 1 vuzpr RHS, RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 653033574U, // <u,u,0,2>: Cost 1 vuzpl LHS, LHS
+ 1007910914U, // <u,u,0,3>: Cost 1 ins LHS, lane 2
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1995282586U, // <u,u,0,6>: Cost 2 vtrnl <u,2,0,2>, RHS
+ 1034485762U, // <u,u,0,7>: Cost 1 ins RHS, lane 2
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 786814766U, // <u,u,1,1>: Cost 1 vzipl LHS, LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 676610150U, // <u,u,1,3>: Cost 1 vuzpr LHS, LHS
+ 1482304822U, // <u,u,1,4>: Cost 2 vext1 <0,u,u,1>, RHS
+ 786815130U, // <u,u,1,5>: Cost 1 vzipl LHS, RHS
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1034485762U, // <u,u,1,7>: Cost 1 ins RHS, lane 2
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1007509507U, // <u,u,2,0>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,u,2,1>: Cost 1 ins LHS, lane 3
+ 921032494U, // <u,u,2,2>: Cost 1 vtrnl LHS, LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1007509507U, // <u,u,2,4>: Cost 1 ins LHS, lane 3
+ 1007509507U, // <u,u,2,5>: Cost 1 ins LHS, lane 3
+ 921032858U, // <u,u,2,6>: Cost 1 vtrnl LHS, RHS
+ 1007509507U, // <u,u,2,7>: Cost 1 ins LHS, lane 3
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1880327918U, // <u,u,3,1>: Cost 2 vzipr LHS, <2,3,u,1>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 806584476U, // <u,u,3,3>: Cost 1 vzipr LHS, LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1880327922U, // <u,u,3,5>: Cost 2 vzipr LHS, <2,3,u,5>
+ 1880326384U, // <u,u,3,6>: Cost 2 vzipr LHS, <0,2,u,6>
+ 806587720U, // <u,u,3,7>: Cost 1 vzipr LHS, RHS
+ 806584481U, // <u,u,3,u>: Cost 1 vzipr LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1007910914U, // <u,u,4,3>: Cost 1 ins LHS, lane 2
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 653036854U, // <u,u,4,6>: Cost 1 vuzpl LHS, RHS
+ 1034485762U, // <u,u,4,7>: Cost 1 ins RHS, lane 2
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1482334933U, // <u,u,5,0>: Cost 2 vext1 <0,u,u,5>, <0,u,u,5>
+ 789796654U, // <u,u,5,1>: Cost 1 vzipl RHS, LHS
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1007910914U, // <u,u,5,3>: Cost 1 ins LHS, lane 2
+ 1482337590U, // <u,u,5,4>: Cost 2 vext1 <0,u,u,5>, RHS
+ 789797018U, // <u,u,5,5>: Cost 1 vzipl RHS, RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 676613430U, // <u,u,5,7>: Cost 1 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1034346499U, // <u,u,6,0>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,u,6,1>: Cost 1 ins RHS, lane 3
+ 924014382U, // <u,u,6,2>: Cost 1 vtrnl RHS, LHS
+ 1007910914U, // <u,u,6,3>: Cost 1 ins LHS, lane 2
+ 1034346499U, // <u,u,6,4>: Cost 1 ins RHS, lane 3
+ 1034346499U, // <u,u,6,5>: Cost 1 ins RHS, lane 3
+ 924014746U, // <u,u,6,6>: Cost 1 vtrnl RHS, RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 833159324U, // <u,u,7,3>: Cost 1 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1906901393U, // <u,u,7,5>: Cost 2 vzipr RHS, <0,4,u,5>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 833162568U, // <u,u,7,7>: Cost 1 vzipr RHS, RHS
+ 833159329U, // <u,u,7,u>: Cost 1 vzipr RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0};
+
+static unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
+ assert(M.size() == 4 && "Expected a 4 entry perfect shuffle");
+
+ // Special case zero-cost nop copies, from either LHS or RHS.
+ if (llvm::all_of(llvm::enumerate(M), [](auto &E) {
+ return E.value() < 0 || E.value() == (int)E.index();
+ }))
+ return 0;
+ if (llvm::all_of(llvm::enumerate(M), [](auto &E) {
+ return E.value() < 0 || E.value() == (int)E.index() + 4;
+ }))
+ return 0;
+
+ // Get the four mask elements from the 2 inputs. Perfect shuffles encode undef
+ // elements with value 8.
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ assert(M[i] < 8 && "Expected a maximum entry of 8 for shuffle mask");
+ if (M[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = M[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ // And extract the cost from the upper bits. The cost is encoded as Cost-1.
+ return (PFEntry >> 30) + 1;
+}
#endif
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index d1b901e58d27..f7c06b9fb71b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -32,6 +33,8 @@
using namespace llvm;
+#define GET_CC_REGISTER_LISTS
+#include "AArch64GenCallingConv.inc"
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
@@ -63,14 +66,6 @@ bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg,
return true;
}
-bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) {
- const Function &F = MF->getFunction();
- return isa<ScalableVectorType>(F.getReturnType()) ||
- any_of(F.args(), [](const Argument &Arg) {
- return isa<ScalableVectorType>(Arg.getType());
- });
-}
-
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
@@ -108,7 +103,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
// This is for OSes other than Windows; Windows is a separate case further
// above.
return CSR_AArch64_AAPCS_X18_SaveList;
- if (hasSVEArgsOrReturn(MF))
+ if (MF->getInfo<AArch64FunctionInfo>()->isSVECC())
return CSR_AArch64_SVE_AAPCS_SaveList;
return CSR_AArch64_AAPCS_SaveList;
}
@@ -335,6 +330,13 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
markSuperRegs(Reserved, AArch64::W16);
+ // SME tiles are not allocatable.
+ if (MF.getSubtarget<AArch64Subtarget>().hasSME()) {
+ for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true);
+ SubReg.isValid(); ++SubReg)
+ Reserved.set(*SubReg);
+ }
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
@@ -417,6 +419,68 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return false;
}
+bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
+ MCRegister Reg) const {
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ bool IsVarArg = STI.isCallingConvWin64(MF.getFunction().getCallingConv());
+
+ auto HasReg = [](ArrayRef<MCRegister> RegList, MCRegister Reg) {
+ return llvm::any_of(RegList,
+ [Reg](const MCRegister R) { return R == Reg; });
+ };
+
+ switch (CC) {
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ case CallingConv::WebKit_JS:
+ return HasReg(CC_AArch64_WebKit_JS_ArgRegs, Reg);
+ case CallingConv::GHC:
+ return HasReg(CC_AArch64_GHC_ArgRegs, Reg);
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::PreserveMost:
+ case CallingConv::CXX_FAST_TLS:
+ case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ case CallingConv::Tail:
+ if (STI.isTargetWindows() && IsVarArg)
+ return HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg);
+ if (!STI.isTargetDarwin()) {
+ switch (CC) {
+ default:
+ return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg);
+ case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg) ||
+ HasReg(CC_AArch64_AAPCS_Swift_ArgRegs, Reg);
+ }
+ }
+ if (!IsVarArg) {
+ switch (CC) {
+ default:
+ return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg);
+ case CallingConv::Swift:
+ case CallingConv::SwiftTail:
+ return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg) ||
+ HasReg(CC_AArch64_DarwinPCS_Swift_ArgRegs, Reg);
+ }
+ }
+ if (STI.isTargetILP32())
+ return HasReg(CC_AArch64_DarwinPCS_ILP32_VarArg_ArgRegs, Reg);
+ return HasReg(CC_AArch64_DarwinPCS_VarArg_ArgRegs, Reg);
+ case CallingConv::Win64:
+ if (IsVarArg)
+ return HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg);
+ return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg);
+ case CallingConv::CFGuard_Check:
+ return HasReg(CC_AArch64_Win64_CFGuard_Check_ArgRegs, Reg);
+ case CallingConv::AArch64_VectorCall:
+ case CallingConv::AArch64_SVE_VectorCall:
+ return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg);
+ }
+}
+
Register
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -588,23 +652,31 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
// Create a scratch register for the frame index elimination in an instruction.
// This function has special handling of stack tagging loop pseudos, in which
-// case it can also change the instruction opcode (but not the operands).
+// case it can also change the instruction opcode.
static Register
-createScratchRegisterForInstruction(MachineInstr &MI,
+createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum,
const AArch64InstrInfo *TII) {
// ST*Gloop have a reserved scratch register in operand 1. Use it, and also
// replace the instruction with the writeback variant because it will now
// satisfy the operand constraints for it.
- if (MI.getOpcode() == AArch64::STGloop) {
- MI.setDesc(TII->get(AArch64::STGloop_wback));
- return MI.getOperand(1).getReg();
- } else if (MI.getOpcode() == AArch64::STZGloop) {
- MI.setDesc(TII->get(AArch64::STZGloop_wback));
- return MI.getOperand(1).getReg();
+ Register ScratchReg;
+ if (MI.getOpcode() == AArch64::STGloop ||
+ MI.getOpcode() == AArch64::STZGloop) {
+ assert(FIOperandNum == 3 &&
+ "Wrong frame index operand for STGloop/STZGloop");
+ unsigned Op = MI.getOpcode() == AArch64::STGloop ? AArch64::STGloop_wback
+ : AArch64::STZGloop_wback;
+ ScratchReg = MI.getOperand(1).getReg();
+ MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true);
+ MI.setDesc(TII->get(Op));
+ MI.tieOperands(1, 3);
} else {
- return MI.getMF()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
+ ScratchReg =
+ MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(ScratchReg, false, false, true);
}
+ return ScratchReg;
}
void AArch64RegisterInfo::getOffsetOpcodes(
@@ -721,9 +793,9 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
+ Register ScratchReg =
+ createScratchRegisterForInstruction(MI, FIOperandNum, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
- MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 0c871ac089a7..12dd70fa4aa8 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -42,8 +42,6 @@ public:
void UpdateCustomCallPreservedMask(MachineFunction &MF,
const uint32_t **Mask) const;
- static bool hasSVEArgsOrReturn(const MachineFunction *MF);
-
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const;
@@ -120,6 +118,9 @@ public:
bool hasBasePointer(const MachineFunction &MF) const;
unsigned getBaseRegister() const;
+ bool isArgumentRegister(const MachineFunction &MF,
+ MCRegister Reg) const override;
+
// Debug information queries.
Register getFrameRegister(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 70daf5abf81d..7a2b165570cb 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -871,7 +871,7 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
// SVE predicate register classes.
class PPRClass<int lastreg> : RegisterClass<
"AArch64",
- [ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16,
+ [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16,
(sequence "P%u", 0, lastreg)> {
let Size = 16;
}
@@ -1212,26 +1212,28 @@ let SubRegIndices = [zasubb] in {
// SME Register Classes
-// Accumulator array
-def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
- let Size = 2048;
-}
+let isAllocatable = 0 in {
+ // Accumulator array
+ def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
+ let Size = 2048;
+ }
-// Accumulator array as single tiles
-def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
- let Size = 2048;
-}
-def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
- let Size = 1024;
-}
-def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
- let Size = 512;
-}
-def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
- let Size = 256;
-}
-def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
- let Size = 128;
+ // Accumulator array as single tiles
+ def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
+ let Size = 2048;
+ }
+ def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
+ let Size = 1024;
+ }
+ def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
+ let Size = 512;
+ }
+ def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
+ let Size = 256;
+ }
+ def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
+ let Size = 128;
+ }
}
// SME Register Operands
@@ -1385,3 +1387,12 @@ def svcr_op : Operand<i32> {
return AArch64SVCR::lookupSVCRByEncoding(MCOp.getImm()) != nullptr;
}];
}
+
+//===----------------------------------------------------------------------===//
+// Register categories.
+//
+
+def GeneralPurposeRegisters : RegisterCategory<[GPR64, GPR32]>;
+
+def FIXED_REGS : RegisterClass<"AArch64", [i64], 64, (add FP, SP, VG, FFR)>;
+def FixedRegisters : RegisterCategory<[CCR, FIXED_REGS]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
index c4965e7146ff..364ce687fd55 100644
--- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -360,8 +360,8 @@ AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
assert(ImpSPOpIdx != -1);
int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
- BL->RemoveOperand(FirstOpIdxToRemove);
- BL->RemoveOperand(SecondOpIdxToRemove);
+ BL->removeOperand(FirstOpIdxToRemove);
+ BL->removeOperand(SecondOpIdxToRemove);
// Now copy over the implicit operands from the original BLR
BL->copyImplicitOps(MF, BLR);
MF.moveCallSiteInfo(&BLR, BL);
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index aacace64e998..e595d20c8d4e 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -14,9 +14,18 @@
// Add vector elements horizontally or vertically to ZA tile.
//===----------------------------------------------------------------------===//
+def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>;
+def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>;
+
let Predicates = [HasSME] in {
+def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>;
+def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>;
+def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>;
+
def ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha">;
def ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva">;
+
+def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>;
}
let Predicates = [HasSMEI64] in {
@@ -29,41 +38,41 @@ let Predicates = [HasSME] in {
// Outer products
//===----------------------------------------------------------------------===//
-defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">;
-defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">;
+defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa", int_aarch64_sme_mopa_wide>;
+defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops", int_aarch64_sme_mops_wide>;
-def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">;
-def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">;
+defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>;
}
let Predicates = [HasSMEF64] in {
-def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">;
-def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">;
+defm FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa", int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops", int_aarch64_sme_mops>;
}
let Predicates = [HasSME] in {
-defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">;
-defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">;
-
-def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">;
-def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">;
-def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">;
-def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">;
-def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">;
-def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">;
-def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">;
-def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">;
+defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa", int_aarch64_sme_mopa_wide>;
+defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops", int_aarch64_sme_mops_wide>;
+
+defm SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops", int_aarch64_sme_usmops_wide>;
}
let Predicates = [HasSMEI64] in {
-def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">;
-def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">;
-def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">;
-def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">;
-def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">;
-def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">;
-def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">;
-def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">;
+defm SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa", int_aarch64_sme_smopa_wide>;
+defm SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops", int_aarch64_sme_smops_wide>;
+defm UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa", int_aarch64_sme_umopa_wide>;
+defm UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops", int_aarch64_sme_umops_wide>;
+defm SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa", int_aarch64_sme_sumopa_wide>;
+defm SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops", int_aarch64_sme_sumops_wide>;
+defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme_usmopa_wide>;
+defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>;
}
let Predicates = [HasSME] in {
@@ -129,15 +138,21 @@ def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>;
def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>;
def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>;
+// Read and write TPIDR2_EL0
+def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
+ (MSR 0xde85, GPR64:$val)>;
+def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
+ (MRS 0xde85)>;
+
//===----------------------------------------------------------------------===//
// SVE2 instructions
//===----------------------------------------------------------------------===//
-def REVD_ZPmZ : sve2_int_perm_revd<"revd">;
+defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>;
-defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>;
-defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>;
+defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp>;
+defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp>;
-defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">;
+defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
} // End let Predicates = [HasSME]
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1d162610de9c..68ff1b78e84b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -165,8 +165,8 @@ def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
def SDT_AArch64Arith : SDTypeProfile<1, 3, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
- SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>
+ SDTCisVec<0>, SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>,
+ SDTCisSameAs<2,3>, SDTCisSameNumEltsAs<0,1>
]>;
def SDT_AArch64FMA : SDTypeProfile<1, 4, [
@@ -175,7 +175,6 @@ def SDT_AArch64FMA : SDTypeProfile<1, 4, [
]>;
// Predicated operations with the result of inactive lanes being unspecified.
-def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>;
def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>;
@@ -194,7 +193,6 @@ def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
-def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>;
def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
@@ -235,6 +233,7 @@ def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch
def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>;
def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>;
def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64revd_mt : SDNode<"AArch64ISD::REVD_MERGE_PASSTHRU", SDT_AArch64Arith>;
// These are like the above but we don't yet have need for ISD nodes. They allow
// a single pattern to match intrinsic and ISD operand layouts.
@@ -242,6 +241,26 @@ def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_
def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>;
def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>;
+def AArch64fmul_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fmul, AArch64fmul_p>;
+def AArch64fadd_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fadd, AArch64fadd_p>;
+def AArch64fsub_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fsub, AArch64fsub_p>;
+
+def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3),
+ (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>;
+
+def AArch64uaba : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_uaba node:$op1, node:$op2, node:$op3),
+ (add node:$op1, (AArch64uabd_p (SVEAllActive), node:$op2, node:$op3))]>;
+
+def AArch64usra : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_usra node:$op1, node:$op2, node:$op3),
+ (add node:$op1, (AArch64lsr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>;
+
+def AArch64ssra : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_ssra node:$op1, node:$op2, node:$op3),
+ (add node:$op1, (AArch64asr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>;
+
def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<1,i1>
@@ -282,6 +301,14 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2),
(AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>;
+// FMAs with a negated multiplication operand can be commuted.
+def AArch64fmls_p : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3),
+ [(AArch64fma_p node:$pred, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op2, node:$op3),
+ (AArch64fma_p node:$pred, node:$op2, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op3)]>;
+
+def AArch64fsubr_p : PatFrag<(ops node:$pg, node:$op1, node:$op2),
+ (AArch64fsub_p node:$pg, node:$op2, node:$op1)>;
+
def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt),
(AArch64fneg_mt node:$pred, node:$op, node:$pt), [{
return N->getFlags().hasNoSignedZeros();
@@ -295,11 +322,14 @@ def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [
def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>;
def AArch64bic : PatFrags<(ops node:$op1, node:$op2),
- [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))),
- (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))),
+ [(and node:$op1, (xor node:$op2, (splat_vector (i32 -1)))),
+ (and node:$op1, (xor node:$op2, (splat_vector (i64 -1)))),
(and node:$op1, (xor node:$op2, (SVEAllActive))),
(AArch64bic_node node:$op1, node:$op2)]>;
+def AArch64subr : PatFrag<(ops node:$op1, node:$op2),
+ (sub node:$op2, node:$op1)>;
+
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
@@ -308,7 +338,7 @@ let Predicates = [HasSVE] in {
def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
@@ -325,25 +355,27 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
- defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
- defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>;
-} // End HasSVEorStreamingSVE
+ defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", int_aarch64_sve_orr, DestructiveBinaryComm>;
+ defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", int_aarch64_sve_eor, DestructiveBinaryComm>;
+ defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", "AND_ZPZZ", int_aarch64_sve_and, DestructiveBinaryComm>;
+ defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", "BIC_ZPZZ", int_aarch64_sve_bic, DestructiveBinary>;
+} // End HasSVEorSME
-let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
-} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
-let Predicates = [HasSVEorStreamingSVE] in {
- defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
- defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
- defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
- defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
+ defm ORR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_orr>;
+ defm EOR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_eor>;
+ defm AND_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_and>;
+ defm BIC_ZPZZ : sve_int_bin_pred_zeroing_bhsd<null_frag>;
+} // End HasSVEorSME, UseExperimentalZeroingPseudos
+let Predicates = [HasSVEorSME] in {
defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
- defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
+ defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>;
defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
@@ -440,11 +472,11 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", "FMINNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>;
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", "FMAX_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", "FMIN_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>;
-
+
defm FADD_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fadd_p>;
defm FSUB_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsub_p>;
defm FMUL_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_two, fpimm_half, fpimm_two, AArch64fmul_p>;
- defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one>;
+ defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsubr_p>;
defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmaxnm_p>;
defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fminnm_p>;
defm FMAX_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmax_p>;
@@ -461,9 +493,9 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FMIN_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>;
}
- defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
- defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
- defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;
defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
@@ -484,9 +516,9 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>;
defm FABD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fabd_p>;
defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
-let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
@@ -499,28 +531,28 @@ let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
-} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
+} // End HasSVEorSME, UseExperimentalZeroingPseudos
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>;
defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>;
defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
@@ -545,7 +577,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
(!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
// Zd = Za + -Zn * Zm
- def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)),
+ def : Pat<(Ty (AArch64fmls_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)),
(!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
// Zd = -Za + Zn * Zm
@@ -576,26 +608,26 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm : fma<nxv4f32, nxv4i1, "S">;
defm : fma<nxv2f32, nxv2i1, "S">;
defm : fma<nxv2f64, nxv2i1, "D">;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
// SVE floating point reductions.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
@@ -613,7 +645,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
- defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
+ defm DUP_ZR : sve_int_perm_dup_r<"dup", splat_vector>;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
@@ -621,61 +653,67 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
// Duplicate FP scalar into all vector elements
- def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
+ def : Pat<(nxv8f16 (splat_vector (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
- def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
+ def : Pat<(nxv4f16 (splat_vector (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
- def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
+ def : Pat<(nxv2f16 (splat_vector (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
- def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
+ def : Pat<(nxv4f32 (splat_vector (f32 FPR32:$src))),
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
- def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
+ def : Pat<(nxv2f32 (splat_vector (f32 FPR32:$src))),
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
- def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
+ def : Pat<(nxv2f64 (splat_vector (f64 FPR64:$src))),
(DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
- def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+ def : Pat<(nxv8bf16 (splat_vector (bf16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv4bf16 (splat_vector (bf16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv2bf16 (splat_vector (bf16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
// Duplicate +0.0 into all vector elements
- def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
- def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
- def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
- def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
- def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
- def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
- def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv8f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv2f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f64 (splat_vector (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+ def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv2bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
// Duplicate Int immediate into all vector elements
- def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+ def : Pat<(nxv16i8 (splat_vector (i32 (SVECpyDupImm8Pat i32:$a, i32:$b)))),
(DUP_ZI_B $a, $b)>;
- def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+ def : Pat<(nxv8i16 (splat_vector (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)))),
(DUP_ZI_H $a, $b)>;
- def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+ def : Pat<(nxv4i32 (splat_vector (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)))),
(DUP_ZI_S $a, $b)>;
- def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))),
+ def : Pat<(nxv2i64 (splat_vector (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
// Duplicate immediate FP into all vector elements.
- def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+ def : Pat<(nxv2f32 (splat_vector (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
- def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+ def : Pat<(nxv4f32 (splat_vector (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
- def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
+ def : Pat<(nxv2f64 (splat_vector (f64 fpimm:$val))),
(DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
// Duplicate FP immediate into all vector elements
let AddedComplexity = 2 in {
- def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
+ def : Pat<(nxv8f16 (splat_vector fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
- def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
+ def : Pat<(nxv4f16 (splat_vector fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
- def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
+ def : Pat<(nxv2f16 (splat_vector fpimm16:$imm8)),
(FDUP_ZI_H fpimm16:$imm8)>;
- def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
+ def : Pat<(nxv4f32 (splat_vector fpimm32:$imm8)),
(FDUP_ZI_S fpimm32:$imm8)>;
- def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
+ def : Pat<(nxv2f32 (splat_vector fpimm32:$imm8)),
(FDUP_ZI_S fpimm32:$imm8)>;
- def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
+ def : Pat<(nxv2f64 (splat_vector fpimm64:$imm8)),
(FDUP_ZI_D fpimm64:$imm8)>;
}
@@ -683,13 +721,13 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
@@ -710,16 +748,21 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;
+ // Define pattern for `nxv1i1 splat_vector(1)`.
+  // We do this here instead of in ISelLowering so that PatFrags can still
+ // recognize a splat.
+ def : Pat<(nxv1i1 immAllOnesV), (PUNPKLO_PP (PTRUE_D 31))>;
+
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
@@ -831,7 +874,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
// non-faulting continuous load with reg+immediate
@@ -871,7 +914,7 @@ let Predicates = [HasSVE] in {
defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
// LD(2|3|4) structured loads with reg+immediate
defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
@@ -899,7 +942,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
@@ -1013,9 +1056,95 @@ let Predicates = [HasSVE] in {
defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+
+ multiclass sve_masked_gather_x2_scaled<ValueType Ty, SDPatternOperator Load, string Inst> {
+ // base + vector of scaled offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))),
+ (!cast<Instruction>(Inst # _SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of signed 32bit scaled offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))),
+ (!cast<Instruction>(Inst # _SXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of unsigned 32bit scaled offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
+ (!cast<Instruction>(Inst # _UXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ multiclass sve_masked_gather_x2_unscaled<ValueType Ty, SDPatternOperator Load, string Inst, Operand ImmTy> {
+ // vector of pointers + immediate offset (includes zero)
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs))),
+ (!cast<Instruction>(Inst # _IMM) PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>;
+ // base + vector of offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))),
+ (!cast<Instruction>(Inst) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of signed 32bit offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))),
+ (!cast<Instruction>(Inst # _SXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of unsigned 32bit offsets
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
+ (!cast<Instruction>(Inst # _UXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ multiclass sve_masked_gather_x4<ValueType Ty, SDPatternOperator Load, Instruction Inst> {
+ def : Pat<(Ty (Load (SVEDup0Undef), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs))),
+ (Inst PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ defm : sve_masked_gather_x2_scaled<nxv2i64, azext_masked_gather_i16_signed_scaled, "GLD1H_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2i64, sext_masked_gather_i16_signed_scaled, "GLD1SH_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2i64, azext_masked_gather_i32_signed_scaled, "GLD1W_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2i64, sext_masked_gather_i32_signed_scaled, "GLD1SW_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2i64, nonext_masked_gather_signed_scaled, "GLD1D">;
+ defm : sve_masked_gather_x2_scaled<nxv2f16, nonext_masked_gather_signed_scaled, "GLD1H_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2f32, nonext_masked_gather_signed_scaled, "GLD1W_D">;
+ defm : sve_masked_gather_x2_scaled<nxv2f64, nonext_masked_gather_signed_scaled, "GLD1D">;
+ defm : sve_masked_gather_x2_scaled<nxv2bf16, nonext_masked_gather_signed_scaled, "GLD1H_D">;
+
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i8_signed_unscaled, "GLD1B_D" , imm0_31>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i8_signed_unscaled, "GLD1SB_D", imm0_31>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i16_signed_unscaled, "GLD1H_D", uimm5s2>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i16_signed_unscaled, "GLD1SH_D", uimm5s2>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i32_signed_unscaled, "GLD1W_D", uimm5s4>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i32_signed_unscaled, "GLD1SW_D", uimm5s4>;
+ defm : sve_masked_gather_x2_unscaled<nxv2i64, nonext_masked_gather_signed_unscaled, "GLD1D", uimm5s8>;
+ defm : sve_masked_gather_x2_unscaled<nxv2f16, nonext_masked_gather_signed_unscaled, "GLD1H_D", uimm5s2>;
+ defm : sve_masked_gather_x2_unscaled<nxv2f32, nonext_masked_gather_signed_unscaled, "GLD1W_D", uimm5s4>;
+ defm : sve_masked_gather_x2_unscaled<nxv2f64, nonext_masked_gather_signed_unscaled, "GLD1D", uimm5s8>;
+ defm : sve_masked_gather_x2_unscaled<nxv2bf16, nonext_masked_gather_signed_unscaled, "GLD1H_D", uimm5s2>;
+
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_signed_scaled, GLD1H_S_SXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_signed_scaled, GLD1SH_S_SXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_signed_scaled, GLD1W_SXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_signed_scaled, GLD1H_S_SXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_signed_scaled, GLD1W_SXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_signed_scaled, GLD1H_S_SXTW_SCALED>;
+
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i8_signed_unscaled, GLD1B_S_SXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i8_signed_unscaled, GLD1SB_S_SXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_signed_unscaled, GLD1H_S_SXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_signed_unscaled, GLD1SH_S_SXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_signed_unscaled, GLD1W_SXTW>;
+ defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_signed_unscaled, GLD1H_S_SXTW>;
+ defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_signed_unscaled, GLD1W_SXTW>;
+ defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_signed_unscaled, GLD1H_S_SXTW>;
+
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_unsigned_scaled, GLD1H_S_UXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_unsigned_scaled, GLD1SH_S_UXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_unsigned_scaled, GLD1W_UXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_unsigned_scaled, GLD1H_S_UXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_unsigned_scaled, GLD1W_UXTW_SCALED>;
+ defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_unsigned_scaled, GLD1H_S_UXTW_SCALED>;
+
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i8_unsigned_unscaled, GLD1B_S_UXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i8_unsigned_unscaled, GLD1SB_S_UXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_unsigned_unscaled, GLD1H_S_UXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_unsigned_unscaled, GLD1SH_S_UXTW>;
+ defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_unsigned_unscaled, GLD1W_UXTW>;
+ defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_unsigned_unscaled, GLD1H_S_UXTW>;
+ defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_unsigned_unscaled, GLD1W_UXTW>;
+ defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_unsigned_unscaled, GLD1H_S_UXTW>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
@@ -1051,7 +1180,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
// Scatters using unpacked, unscaled 32-bit offsets, e.g.
@@ -1100,12 +1229,87 @@ let Predicates = [HasSVE] in {
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
- defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
+ defm SST1H_D : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm SST1W_D : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm SST1D : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
+
+ multiclass sve_masked_scatter_x2_scaled<ValueType Ty, SDPatternOperator Store, string Inst> {
+ // base + vector of scaled offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)),
+ (!cast<Instruction>(Inst # _SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of signed 32bit scaled offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)),
+ (!cast<Instruction>(Inst # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of unsigned 32bit scaled offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
+ (!cast<Instruction>(Inst # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ multiclass sve_masked_scatter_x2_unscaled<ValueType Ty, SDPatternOperator Store, string Inst, Operand ImmTy> {
+ // vector of pointers + immediate offset (includes zero)
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs)),
+ (!cast<Instruction>(Inst # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>;
+ // base + vector of offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)),
+ (!cast<Instruction>(Inst) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of signed 32bit offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)),
+ (!cast<Instruction>(Inst # _SXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ // base + vector of unsigned 32bit offsets
+ def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
+ (!cast<Instruction>(Inst # _UXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ multiclass sve_masked_scatter_x4<ValueType Ty, SDPatternOperator Store, Instruction Inst> {
+ def : Pat<(Store (Ty ZPR:$data), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs)),
+ (Inst ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
+ }
+
+ defm : sve_masked_scatter_x2_scaled<nxv2i64, trunc_masked_scatter_i16_signed_scaled, "SST1H_D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2i64, trunc_masked_scatter_i32_signed_scaled, "SST1W_D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2i64, nontrunc_masked_scatter_signed_scaled, "SST1D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2f16, nontrunc_masked_scatter_signed_scaled, "SST1H_D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2f32, nontrunc_masked_scatter_signed_scaled, "SST1W_D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2f64, nontrunc_masked_scatter_signed_scaled, "SST1D">;
+ defm : sve_masked_scatter_x2_scaled<nxv2bf16, nontrunc_masked_scatter_signed_scaled, "SST1H_D">;
+
+ defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i8_signed_unscaled, "SST1B_D" , imm0_31>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i16_signed_unscaled, "SST1H_D", uimm5s2>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i32_signed_unscaled, "SST1W_D", uimm5s4>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2i64, nontrunc_masked_scatter_signed_unscaled, "SST1D", uimm5s8>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2f16, nontrunc_masked_scatter_signed_unscaled, "SST1H_D", uimm5s2>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2f32, nontrunc_masked_scatter_signed_unscaled, "SST1W_D", uimm5s4>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2f64, nontrunc_masked_scatter_signed_unscaled, "SST1D", uimm5s8>;
+ defm : sve_masked_scatter_x2_unscaled<nxv2bf16, nontrunc_masked_scatter_signed_unscaled, "SST1H_D", uimm5s2>;
+
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_signed_scaled, SST1H_S_SXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_signed_scaled, SST1W_SXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_signed_scaled, SST1H_S_SXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_signed_scaled, SST1W_SXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_signed_scaled, SST1H_S_SXTW_SCALED>;
+
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i8_signed_unscaled, SST1B_S_SXTW>;
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_signed_unscaled, SST1H_S_SXTW>;
+ defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_signed_unscaled, SST1W_SXTW>;
+ defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_signed_unscaled, SST1H_S_SXTW>;
+ defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_signed_unscaled, SST1W_SXTW>;
+ defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_signed_unscaled, SST1H_S_SXTW>;
+
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_unsigned_scaled, SST1H_S_UXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_unsigned_scaled, SST1W_UXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_unsigned_scaled, SST1H_S_UXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_unsigned_scaled, SST1W_UXTW_SCALED>;
+ defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_unsigned_scaled, SST1H_S_UXTW_SCALED>;
+
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i8_unsigned_unscaled, SST1B_S_UXTW>;
+ defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_unsigned_unscaled, SST1H_S_UXTW>;
+ defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_unsigned_unscaled, SST1W_UXTW>;
+ defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_unsigned_unscaled, SST1H_S_UXTW>;
+ defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_unsigned_unscaled, SST1W_UXTW>;
+ defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_unsigned_unscaled, SST1H_S_UXTW>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
// ST(2|3|4) structured stores (register + immediate)
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
@@ -1161,7 +1365,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
// Contiguous prefetch (register + register)
def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
- def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
+ def PRFW_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, ComplexPattern AddrCP> {
@@ -1184,9 +1388,9 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, am_sve_regreg_lsl0>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, am_sve_regreg_lsl1>;
- defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, am_sve_regreg_lsl2>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFW_PRR, am_sve_regreg_lsl2>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, am_sve_regreg_lsl3>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
// Gather prefetch using scaled 32-bit offsets, e.g.
@@ -1249,7 +1453,7 @@ let Predicates = [HasSVE] in {
// Patterns to generate adr instruction.
// adr z0.d, [z0.d, z0.d, uxtw]
def : Pat<(add nxv2i64:$Op1,
- (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))))),
+ (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
(ADR_UXTW_ZZZ_D_0 $Op1, $Op2)>;
// adr z0.d, [z0.d, z0.d, sxtw]
def : Pat<(add nxv2i64:$Op1,
@@ -1262,7 +1466,7 @@ let Predicates = [HasSVE] in {
def : Pat<(add Ty:$Op1,
(Ty (AArch64lsl_p (PredTy (SVEAllActive)),
Ty:$Op2,
- (Ty (AArch64dup (ShiftTy ShiftAmt)))))),
+ (Ty (splat_vector (ShiftTy ShiftAmt)))))),
(DestAdrIns $Op1, $Op2)>;
}
defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_1, 1>;
@@ -1277,14 +1481,14 @@ let Predicates = [HasSVE] in {
multiclass adrXtwShiftPat<ValueType Ty, ValueType PredTy, int ShiftAmt> {
def : Pat<(add Ty:$Op1,
(Ty (AArch64lsl_p (PredTy (SVEAllActive)),
- (Ty (and Ty:$Op2, (Ty (AArch64dup (i64 0xFFFFFFFF))))),
- (Ty (AArch64dup (i64 ShiftAmt)))))),
+ (Ty (and Ty:$Op2, (Ty (splat_vector (i64 0xFFFFFFFF))))),
+ (Ty (splat_vector (i64 ShiftAmt)))))),
(!cast<Instruction>("ADR_UXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>;
def : Pat<(add Ty:$Op1,
(Ty (AArch64lsl_p (PredTy (SVEAllActive)),
(Ty (sext_inreg Ty:$Op2, nxv2i32)),
- (Ty (AArch64dup (i64 ShiftAmt)))))),
+ (Ty (splat_vector (i64 ShiftAmt)))))),
(!cast<Instruction>("ADR_SXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>;
}
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 1>;
@@ -1292,7 +1496,7 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
@@ -1310,6 +1514,10 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
// Extract lo/hi halves of legal predicate types.
+ def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP PPR:$Ps)>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 1))),
+ (PUNPKHI_PP PPR:$Ps)>;
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
(PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
@@ -1400,6 +1608,8 @@ let Predicates = [HasSVEorStreamingSVE] in {
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
// Concatenate two predicates.
+ def : Pat<(nxv2i1 (concat_vectors nxv1i1:$p1, nxv1i1:$p2)),
+ (UZP1_PPP_D $p1, $p2)>;
def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
(UZP1_PPP_S $p1, $p2)>;
def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)),
@@ -1475,7 +1685,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>;
defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>;
defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETUNE, SETNE, SETUNE, SETNE>;
defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
@@ -1485,7 +1695,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>;
defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>;
defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>;
- defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>;
+ defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETUNE, SETNE, SETUNE, SETNE>;
defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
@@ -1522,7 +1732,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>;
defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>;
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
@@ -1619,16 +1829,16 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
-let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>;
-} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
+} // End HasSVEorSME, UseExperimentalZeroingPseudos
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">;
defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">;
@@ -1679,60 +1889,61 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
- def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))),
- (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ //These patterns exist to improve the code quality of conversions on unpacked types.
+ def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))),
+ (FCVT_ZPmZ_HtoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
// FP_ROUND has an additional 'precise' flag which indicates the type of rounding.
// This is ignored by the pattern below where it is matched by (i64 timm0_1)
- def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))),
- (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))),
+ (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- // Floating-point -> signed integer
- def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ // Signed integer -> Floating-point
+ def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))),
- (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg),
+ def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 (SVEAllActive):$Pg),
(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))),
- (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))),
- (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (SCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))),
- (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (SCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))),
- (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (SCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- // Floating-point -> unsigned integer
- def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ // Unsigned integer -> Floating-point
+ def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(and (nxv2i64 ZPR:$Zs),
- (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))),
- (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (nxv2i64 (splat_vector (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(and (nxv2i64 ZPR:$Zs),
- (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))),
- (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg),
+ def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive):$Pg),
(and (nxv4i32 ZPR:$Zs),
- (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))),
- (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (nxv4i32 (splat_vector (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(and (nxv2i64 ZPR:$Zs),
- (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))),
- (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
- def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(and (nxv2i64 ZPR:$Zs),
- (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))),
- (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+ (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>;
@@ -1743,27 +1954,27 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>;
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
-let Predicates = [HasBF16, HasSVEorStreamingSVE] in {
+let Predicates = [HasBF16, HasSVEorSME] in {
defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
-} // End HasBF16, HasSVEorStreamingSVE
+} // End HasBF16, HasSVEorSME
let Predicates = [HasBF16, HasSVE] in {
defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
} // End HasBF16, HasSVE
-let Predicates = [HasBF16, HasSVEorStreamingSVE] in {
- defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
- defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
- defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
- defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+let Predicates = [HasBF16, HasSVEorSME] in {
+ defm BFMLALB_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+ defm BFMLALT_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+ defm BFMLALB_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+ defm BFMLALT_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
-} // End HasBF16, HasSVEorStreamingSVE
+} // End HasBF16, HasSVEorSME
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
@@ -1875,7 +2086,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
let AddedComplexity = 1 in {
class LD1RPat<ValueType vt, SDPatternOperator operator,
Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
- Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
+ Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
(load (ptrue 31), GPR64:$base, $offset)>;
}
@@ -1963,22 +2174,22 @@ let Predicates = [HasSVEorStreamingSVE] in {
GPR32:$op, sub_32), $imm),
sub_32))>;
- def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
+ def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
(INCH_ZPiI ZPR:$op, 31, $imm)>;
- def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
+ def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
(INCW_ZPiI ZPR:$op, 31, $imm)>;
- def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))),
+ def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))),
(INCD_ZPiI ZPR:$op, 31, $imm)>;
- def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
+ def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
(DECH_ZPiI ZPR:$op, 31, $imm)>;
- def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
+ def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
(DECW_ZPiI ZPR:$op, 31, $imm)>;
- def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))),
+ def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))),
(DECD_ZPiI ZPR:$op, 31, $imm)>;
}
- let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL], AddedComplexity = 5 in {
+ let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in {
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
(INCH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
@@ -2098,15 +2309,23 @@ let Predicates = [HasSVEorStreamingSVE] in {
def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv1i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv1i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv1i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv1i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
// These allow casting from/to unpacked floating-point types.
def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
@@ -2145,12 +2364,12 @@ let Predicates = [HasSVEorStreamingSVE] in {
}
// 2-element contiguous loads
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2bf16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
@@ -2158,18 +2377,18 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, azext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, sext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, azext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, sext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4bf16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, azext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, sext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
@@ -2397,7 +2616,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE] in {
multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
@@ -2482,7 +2701,7 @@ let Predicates = [HasSVE] in {
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
} // End HasSVE
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
@@ -2716,7 +2935,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
(f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
}
-} // End HasSVEorStreamingSVE
+} // End HasSVEorSME
let Predicates = [HasSVE, HasMatMulInt8] in {
defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
@@ -2724,11 +2943,11 @@ let Predicates = [HasSVE, HasMatMulInt8] in {
defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
} // End HasSVE, HasMatMulInt8
-let Predicates = [HasSVEorStreamingSVE, HasMatMulInt8] in {
+let Predicates = [HasSVEorSME, HasMatMulInt8] in {
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
-} // End HasSVEorStreamingSVE, HasMatMulInt8
+} // End HasSVEorSME, HasMatMulInt8
let Predicates = [HasSVE, HasMatMulFP32] in {
defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
@@ -2746,16 +2965,16 @@ let Predicates = [HasSVE, HasMatMulFP64] in {
defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
} // End HasSVE, HasMatMulFP64
-let Predicates = [HasSVEorStreamingSVE, HasMatMulFP64] in {
+let Predicates = [HasSVEorSME, HasMatMulFP64] in {
defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
-} // End HasSVEorStreamingSVE, HasMatMulFP64
+} // End HasSVEorSME, HasMatMulFP64
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
@@ -2903,17 +3122,17 @@ let Predicates = [HasSVE2orStreamingSVE] in {
defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqshl>;
defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqrshl>;
defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqrshl>;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
-let Predicates = [HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos] in {
+let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in {
defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
-} // End HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos
+} // End HasSVE2orSME, UseExperimentalZeroingPseudos
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 predicated shifts
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>;
defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>;
@@ -2960,18 +3179,18 @@ let Predicates = [HasSVE2orStreamingSVE] in {
defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
- defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
- defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
- defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", AArch64ssra>;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", AArch64usra>;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra, int_aarch64_sve_srshr>;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra, int_aarch64_sve_urshr>;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;
// SVE2 integer absolute difference and accumulate
- defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
- defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;
+ defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", AArch64saba>;
+ defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", AArch64uaba>;
// SVE2 integer absolute difference and accumulate long
defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
@@ -3026,7 +3245,7 @@ let Predicates = [HasSVE2orStreamingSVE] in {
defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
// SVE2 character match
@@ -3034,7 +3253,7 @@ let Predicates = [HasSVE2] in {
defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
} // End HasSVE2
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 bitwise exclusive-or interleaved
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
@@ -3049,7 +3268,7 @@ let Predicates = [HasSVE2orStreamingSVE] in {
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
// SVE2 histogram generation (segment)
@@ -3059,7 +3278,7 @@ let Predicates = [HasSVE2] in {
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
} // End HasSVE2
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
@@ -3091,7 +3310,7 @@ let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
- defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
+ defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>;
defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
@@ -3101,7 +3320,7 @@ let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
// SVE2 non-temporal gather loads
@@ -3120,10 +3339,10 @@ let Predicates = [HasSVE2] in {
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
} // End HasSVE2
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
// SVE2 non-temporal scatter stores
@@ -3137,7 +3356,7 @@ let Predicates = [HasSVE2] in {
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
} // End HasSVE2
-let Predicates = [HasSVE2orStreamingSVE] in {
+let Predicates = [HasSVE2orSME] in {
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
@@ -3156,7 +3375,7 @@ let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
-} // End HasSVE2orStreamingSVE
+} // End HasSVE2orSME
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 009219ce3c54..c6b112d0d2f1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the machine model for the ARM Cortex-A55 processors.
+// This file defines the machine model for the ARM Cortex-A55 processors. Note
+// that this schedule is currently used as the default for -mcpu=generic. As a
+// result, some of the modelling decisions made do not precisely model the
+// Cortex-A55, instead aiming to be a good compromise between different cpus.
//
//===----------------------------------------------------------------------===//
@@ -149,8 +152,31 @@ def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
-def : WriteRes<WriteVd, [CortexA55UnitFPALU]> { let Latency = 4; }
-def : WriteRes<WriteVq, [CortexA55UnitFPALU,CortexA55UnitFPALU]> { let Latency = 4; let BeginGroup = 1; }
+
+// NEON
+class CortexA55WriteVd<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+ let Latency = n;
+}
+class CortexA55WriteVq<int n, ProcResourceKind res> : SchedWriteRes<[res, res]> {
+ let Latency = n;
+ let BeginGroup = 1;
+}
+def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>;
+def : SchedAlias<WriteVd, CortexA55WriteVd<4, CortexA55UnitFPALU>>;
+def : SchedAlias<WriteVq, CortexA55WriteVq<4, CortexA55UnitFPALU>>;
// FP ALU specific new schedwrite definitions
def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;}
@@ -358,4 +384,99 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+// 4.15. Advanced SIMD integer instructions
+// ASIMD absolute diff
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
+// ASIMD absolute diff accum
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ABAL?v")>;
+// ASIMD absolute diff long
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>;
+// ASIMD arith #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
+ "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
+ "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
+// ASIMD arith #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
+ "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
+ "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
+ "ADDPv(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
+ "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
+ "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
+ "ADDPv(16i8|2i64|4i32|8i16)$")>;
+// ASIMD arith #3
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "SADDLv", "UADDLv", "SADDWv",
+ "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
+// ASIMD arith #5
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "RADDHNv", "RSUBHNv")>;
+// ASIMD arith, reduce
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ADDVv", "SADDLVv", "UADDLVv")>;
+// ASIMD compare #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
+// ASIMD compare #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
+// ASIMD logical $1
+def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8",
+ "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
+def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8",
+ "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
+// ASIMD max/min, basic
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>;
+// SIMD max/min, reduce
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>;
+// ASIMD multiply, by element
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
+ "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply
+def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>;
+def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>;
+// ASIMD multiply accumulate
+def : InstRW<[CortexA55WriteMlaVd_4], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteMlaVq_4], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
+def : InstRW<[CortexA55WriteMlaIxVq_4], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply accumulate half
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>;
+// ASIMD multiply accumulate long
+def : InstRW<[CortexA55WriteMlaLVq_4], (instregex "[SU]ML[AS]Lv")>;
+// ASIMD multiply accumulate long #2
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>;
+// ASIMD dot product
+def : InstRW<[CortexA55WriteDotVd_4], (instregex "[SU]DOTv8i8")>;
+def : InstRW<[CortexA55WriteDotVq_4], (instregex "[SU]DOTv16i8")>;
+// ASIMD dot product, by scalar
+def : InstRW<[CortexA55WriteDotScVq_4], (instregex "[SU]DOTlanev")>;
+// ASIMD multiply long
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>;
+// ASIMD pairwise add and accumulate
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ADALPv")>;
+// ASIMD shift accumulate
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
+// ASIMD shift accumulate #2
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]RSRA[vd]")>;
+// ASIMD shift by immed
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv",
+ "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
+// ASIMD shift by immed
+// SXTL and UXTL are aliases for SHLL
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>;
+// ASIMD shift by immed #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
+ "RSHRNv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)",
+ "RSHRNv(16i8|4i32|8i16)")>;
+// ASIMD shift by register
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
+// ASIMD shift by register #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
+
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index fa10d056b7f7..6b053f1969b4 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures =
[HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth,
- HasSVE2orStreamingSVE];
+ HasSVE2orSME];
let FullInstRWOverlapCheck = 0;
}
@@ -3348,7 +3348,7 @@ def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>;
def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>;
// [351] "prfw $prfop, $Pg, [$Rn, $Rm]";
-def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>;
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRR)>;
// [352] "prfw $prfop, $Pg, [$Rn, $Zm]";
def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>;
@@ -3554,7 +3554,7 @@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>;
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>;
// [421] "st1b $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>;
// [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]";
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>;
@@ -3566,7 +3566,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>;
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>;
// [425] "st1d $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D, SST1D_SCALED, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>;
// [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]";
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>;
@@ -3578,7 +3578,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>;
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>;
// [429] "st1h $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D, SST1H_D_SCALED, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>;
// [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]";
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>;
@@ -3590,7 +3590,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>;
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>;
// [433] "st1w $Zt, $Pg, [$Rn, $Zm]";
-def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>;
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D, SST1W_D_SCALED, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>;
// [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]";
def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
new file mode 100644
index 000000000000..32f7299fbf87
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
@@ -0,0 +1,1136 @@
+//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Ampere Computing Ampere-1 to
+// support instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// The Ampere-1 core is an out-of-order micro-architecture. The front
+// end has branch prediction, with a 10-cycle recovery time from a
+// mispredicted branch. Instructions coming out of the front end are
+// decoded into internal micro-ops (uops).
+
+def Ampere1Model : SchedMachineModel {
+ let IssueWidth = 4; // 4-way decode and dispatch
+ let MicroOpBufferSize = 174; // micro-op re-order buffer size
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 10; // Branch mispredict penalty
+ let LoopMicroOpBufferSize = 32; // Instruction queue size
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ SMEUnsupported.F);
+}
+
+let SchedModel = Ampere1Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Ampere-1.
+// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
+// and 2 memory) issue into. The integer and FP schedulers can each issue
+// one uop per cycle, while the memory schedulers can each issue one load
+// and one store address calculation per cycle.
+
+def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w
+def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts
+def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle
+def Ampere1UnitL : ProcResource<2>; // load
+def Ampere1UnitS : ProcResource<2>; // store address calculation
+def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write
+def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto
+def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves
+
+def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
+def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Ampere-1.
+
+def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+ Ampere1UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+ Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS,
+                                                 Ampere1UnitAB]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+                                             Ampere1UnitZ, Ampere1UnitZ]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+
+def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 9;
+}
+
+def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 7;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA,
+ Ampere1UnitA]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 8;
+}
+
+def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 9;
+ let NumMicroOps = 14;
+}
+
+def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitS, Ampere1UnitS,
+ Ampere1UnitZ, Ampere1UnitZ,
+ Ampere1UnitZ, Ampere1UnitZ]> {
+ let Latency = 9;
+ let NumMicroOps = 16;
+}
+
+def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 6;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 11;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitL, Ampere1UnitL,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 12;
+}
+
+def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+}
+
+def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+ Ampere1UnitXY, Ampere1UnitXY]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
+
+def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 18;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 19;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 25;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 32;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+ let Latency = 34;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 34;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 39;
+ let NumMicroOps = 1;
+}
+
+def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+ let Latency = 62;
+ let NumMicroOps = 1;
+}
+
+// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
+// which are a single uop, and for extended registers, which have full flexibility
+// across Unit A or B for both uops.
+def Ampere1Write_Arith : SchedWriteVariant<[
+ SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
+ SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>,
+ SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>;
+
+def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
+ SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
+ SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>,
+ SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>;
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1.
+// This provides a coarse model, which is then specialised below.
+
+def : WriteRes<WriteImm, [Ampere1UnitAB]>; // MOVN, MOVZ
+def : WriteRes<WriteI, [Ampere1UnitAB]>; // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+} // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+} // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [Ampere1UnitB]>; // EXTR shifts a reg pair
+def : WriteRes<WriteIS, [Ampere1UnitB]>; // Shift/Scale
+def : WriteRes<WriteID32, [Ampere1UnitBS]> {
+ let Latency = 18;
+} // 32-bit Divide
+def : WriteRes<WriteID64, [Ampere1UnitBS]> {
+ let Latency = 34;
+} // 64-bit Divide
+def : WriteRes<WriteIM32, [Ampere1UnitBS]> {
+ let Latency = 3;
+} // 32-bit Multiply
+def : WriteRes<WriteIM64, [Ampere1UnitBS]> {
+ let Latency = 3;
+} // 64-bit Multiply
+def : WriteRes<WriteBr, [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD, [Ampere1UnitL]> {
+ let Latency = 4;
+} // Load from base addr plus immediate offset
+def : WriteRes<WriteST, [Ampere1UnitS]> {
+ let Latency = 1;
+} // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+} // Store a register pair.
+def : WriteRes<WriteAdr, [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+} // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+} // Store to a register index (maybe scaled).
+def : WriteRes<WriteF, [Ampere1UnitXY]> {
+ let Latency = 2;
+} // General floating-point ops.
+def : WriteRes<WriteFCmp, [Ampere1UnitX]> {
+ let Latency = 5;
+} // Floating-point compare.
+def : WriteRes<WriteFCvt, [Ampere1UnitXY]> {
+ let Latency = 6;
+} // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
+} // Float-int register copy.
+def : WriteRes<WriteFImm, [Ampere1UnitXY]> {
+ let Latency = 2;
+} // Floating-point immediate.
+def : WriteRes<WriteFMul, [Ampere1UnitXY]> {
+ let Latency = 5;
+} // Floating-point multiply.
+def : WriteRes<WriteFDiv, [Ampere1UnitXY]> {
+ let Latency = 34;
+} // Floating-point division.
+def : WriteRes<WriteVd, [Ampere1UnitXY]> {
+ let Latency = 3;
+} // 64bit Vector D ops.
+def : WriteRes<WriteVq, [Ampere1UnitXY]> {
+ let Latency = 3;
+} // 128bit Vector Q ops.
+def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> {
+ let Latency = 5;
+} // Vector loads.
+def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> {
+ let Latency = 2;
+} // Vector stores.
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
+
+def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;
+
+// Branch instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
+
+// Cryptography instructions
+// -- AES encryption/decryption
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
+// -- Polynomial multiplication
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
+// -- SHA-256 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
+// -- SHA-256 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
+// -- SHA-3 instructions
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
+// -- SHA-512 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
+// -- SHA-512 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
+// -- SHA1 choose/majority/parity
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
+// -- SHA1 hash/schedule update
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;
+
+// FP and vector load instructions
+// -- Load 1-element structure to one/all lanes
+// ---- all lanes
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+ (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// ---- one lane
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+ (instregex "^LD1i(8|16|32|64)")>;
+// -- Load 1-element structure to one/all lanes, 1D size
+def : InstRW<[Ampere1Write_5cyc_1L],
+ (instregex "^LD1Rv1d")>;
+// -- Load 1-element structures to 1 register
+def : InstRW<[Ampere1Write_5cyc_1L],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 2 registers
+def : InstRW<[Ampere1Write_5cyc_2L],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 3 registers
+def : InstRW<[Ampere1Write_6cyc_3L],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to all lanes of 2 registers, 1D size
+def : InstRW<[Ampere1Write_5cyc_2L],
+ (instregex "^LD2Rv1d")>;
+// -- Load 2-element structure to all lanes of 2 registers, other sizes
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to one lane of 2 registers
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2i(8|16|32|64)")>;
+// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+ (instregex "^LD2Twov(16b|8h|4s|2d)")>;
+// -- Load 2-element structures to 2 registers, 8B/4H/2S size
+def : InstRW<[Ampere1Write_9cyc_2L_3XY],
+ (instregex "^LD2Twov(8b|4h|2s)")>;
+// -- Load 3-element structure to all lanes of 3 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_3L],
+ (instregex "^LD3Rv1d")>;
+// -- Load 3-element structure to all lanes of 3 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 3-element structure to one lane of 3 registers
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3i(8|16|32|64)")>;
+// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_3L_3XY],
+ (instregex "^LD3Threev(16b|8h|4s)")>;
+// -- Load 3-element structures to 3 registers, 2D size
+def : InstRW<[Ampere1Write_8cyc_3L_3XY],
+ (instregex "^LD3Threev2d")>;
+// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_10cyc_3L_3XY],
+ (instregex "^LD3Threev(8b|4h|2s)")>;
+// -- Load 4-element structure to all lanes of 4 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD4Rv1d")>;
+// -- Load 4-element structure to all lanes of 4 registers, other sizes
+def : InstRW<[Ampere1Write_8cyc_4L_4XY],
+ (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 4-element structure to one lane of 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+ (instregex "^LD4i(8|16|32|64)")>;
+// -- Load 4-element structures to 4 registers, 2D size
+def : InstRW<[Ampere1Write_9cyc_4L_4XY],
+ (instregex "^LD4Fourv2d")>;
+// -- Load 4-element structures to 4 registers, 2S size
+def : InstRW<[Ampere1Write_12cyc_4L_8XY],
+ (instregex "^LD4Fourv2s")>;
+// -- Load 4-element structures to 4 registers, other sizes
+def : InstRW<[Ampere1Write_11cyc_4L_8XY],
+ (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
+// -- Load pair, Q-form
+def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>;
+// -- Load pair, S/D-form
+def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>;
+// -- Load register
+def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>;
+// -- Load register, sign-extended register
+def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>;
+
+// FP and vector store instructions
+// -- Store 1-element structure from one lane of 1 register
+def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
+ (instregex "^ST1i(8|16|32|64)")>;
+// -- Store 1-element structures from 1 register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 2 registers
+def : InstRW<[Ampere1Write_3cyc_2S_2Z],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 3 registers
+def : InstRW<[Ampere1Write_4cyc_3S_3Z],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 1-element structures from 4 registers
+def : InstRW<[Ampere1Write_5cyc_4S_4Z],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 2-element structure from one lane of 2 registers
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+ (instregex "^ST2i(8|16|32|64)")>;
+// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes
+def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
+ (instregex "^ST2Twov(16b|8h|4s|2d)")>;
+// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
+ (instregex "^ST2Twov(8b|4h|2s)")>;
+// -- Store 3-element structure from one lane of 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+ (instregex "^ST3i(8|16|32|64)")>;
+// -- Store 3-element structures from 3 registers
+def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
+ (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Store 4-element structure from one lane of 4 registers
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+ (instregex "^ST4i(8|16|32|64)")>;
+// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes
+def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
+ (instregex "^ST4Fourv(16b|8h|4s)")>;
+// -- Store 4-element structures from 4 registers, 2D sizes
+def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
+ (instregex "^ST4Fourv2d")>;
+// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes
+def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
+ (instregex "^ST4Fourv(8b|4h|2s)")>;
+// -- Store pair, Q-form
+def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>;
+// -- Store pair, S/D-form
+def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>;
+// -- Store register
+def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>;
+// -- Store register, sign-extended register offset
+def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>;
+
+// FP data processing, bfloat16 format
+def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
+def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>;
+def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;
+
+// FP data processing, scalar/vector, half precision
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
+def : InstRW<[Ampere1Write_4cyc_1X],
+ (instregex "^FCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
+ (instregex "^FCCMPE?H")>;
+def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
+ (instregex "^FCSELH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>;
+def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
+def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
+def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>;
+
+// FP data processing, scalar/vector, single/double precision
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY],
+ (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1X],
+ (instregex "^FCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X],
+ (instregex "^FCCMPE?(S|D)")>;
+def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY],
+ (instregex "^FCSEL(S|D)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
+def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>;
+def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>;
+def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>;
+def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>;
+
+// FP miscellaneous instructions
+def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>;
+def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
+def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
+def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>;
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>;
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>;
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
+def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>;
+
+// Integer arithmetic and logical instructions
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "ADC(W|X)r", "SBC(W|X)r")>;
+def : InstRW<[Ampere1Write_Arith],
+ (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>;
+def : InstRW<[Ampere1Write_ArithFlagsetting],
+ (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(ADC|SBC)S(W|X)r")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(CCMN|CCMP)(X|W)")>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+ (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
+def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>;
+def : InstRW<[Ampere1Write_3cyc_1BS],
+ (instregex "(S|U)MULHr")>;
+def : InstRW<[Ampere1Write_4cyc_1BS],
+ (instregex "(S|U)?M(ADD|SUB)L?r")>;
+
+// Integer load instructions
+def : InstRW<[Ampere1Write_4cyc_2L],
+ (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDR(B|D|H|Q|S)ui")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDR(D|Q|W|X)l")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDTR(B|H|W|X)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDUR(BB|HH|X|W)i")>;
+def : InstRW<[Ampere1Write_4cyc_1L],
+ (instregex "LDURS(BW|BX|HW|HX|W)i")>;
+def : InstRW<[Ampere1Write_5cyc_1AB_1L],
+ (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1L],
+ (instrs PRFMl, PRFUMi, PRFUMi)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_1L],
+ (instrs PRFMroW, PRFMroX)>;
+
+// Integer miscellaneous instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>;
+def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>;
+def : InstRW<[Ampere1Write_1cyc_1AB],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+ (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+ (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
+
+// Integer store instructions
+def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>;
+def : InstRW<[Ampere1Write_2cyc_1B_1S],
+ (instrs STPWi, STPXi)>;
+def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
+ (instregex "STP(W|X)(pre|post)")>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+ (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+ (instregex "STUR(BB|HH|X|W)i",
+ "STR(X|W)ui",
+ "STUR(BB|HH|X|W)i")>;
+def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>;
+
+// Pointer authentication
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
+def : InstRW<[Ampere1Write_8cyc_1BS_1A],
+ (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
+def : InstRW<[Ampere1Write_8cyc_1BS_2A],
+ (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
+def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
+def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;
+
+// Vector integer instructions
+// -- absolute difference
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
+ "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
+// -- arithmetic
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
+ "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
+ "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
+// -- arithmetic, horizontal, 16B
+def : InstRW<[Ampere1Write_12cyc_4XY],
+ (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
+def : InstRW<[Ampere1Write_12cyc_4XY],
+ (instregex "^[SU](MIN|MAX)Vv16i8v")>;
+// -- arithmetic, horizontal, 4H/4S
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
+// -- arithmetic, horizontal, 8B/8H
+def : InstRW<[Ampere1Write_9cyc_3XY],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
+def : InstRW<[Ampere1Write_9cyc_3XY],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
+// -- arithmetic, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
+// -- arithmetic, pairwise
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
+// -- arithmetic, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>;
+// -- bit count
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^(CLS|CLZ|CNT)v")>;
+// -- compare
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv",
+ "^CMHIv", "^CMHSv")>;
+// -- compare non-zero
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
+// -- dot product
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>;
+// -- fp reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>;
+// -- integer reciprocal estimate
+def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>;
+// -- logical
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+// -- logical, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY],
+ (instregex "RSHRNv",
+ "SHRNv", "SQSHRNv", "SQSHRUNv",
+ "UQXTNv")>;
+// -- matrix multiply
+def : InstRW<[Ampere1Write_6cyc_2XY],
+ (instrs SMMLA, UMMLA, USMMLA)>;
+// -- max/min
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
+// -- move immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>;
+// -- multiply
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>;
+// -- multiply accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>;
+// -- negation, saturating
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>;
+// -- reverse bits/bytes
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>;
+// -- shift
+def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// -- shift and accumulate
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;
+// -- shift, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+ (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU",
+ "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL",
+ "^UQSHL")>;
+
+// Vector miscellaneous instructions
+// -- duplicate element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>;
+// -- duplicate from GPR
+def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>;
+// -- extract narrow
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>;
+// -- insert/extract element
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
+// -- move FP immediate
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>;
+// -- move element to GPR
+def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>;
+// -- move from GPR to any element
+def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
+// -- table lookup
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
+def : InstRW<[Ampere1Write_4cyc_2XY],
+ (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
+def : InstRW<[Ampere1Write_6cyc_3XY],
+ (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
+def : InstRW<[Ampere1Write_8cyc_4XY],
+ (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
+// -- transpose
+def : InstRW<[Ampere1Write_2cyc_1XY],
+ (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
+// -- zip/unzip
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
+
+} // SchedModel = Ampere1Model
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
new file mode 100644
index 000000000000..8552c07bda56
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td
@@ -0,0 +1,25 @@
+//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 Ampere Computing processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Auxiliary predicates.
+
+// Check for a LSL shift <= 4
+def AmpereCheapLSL : MCSchedPredicate<
+ CheckAny<[CheckShiftBy0,
+ CheckAll<
+ [CheckShiftLSL,
+ CheckAny<
+ [CheckShiftBy1,
+ CheckShiftBy2,
+ CheckShiftBy3,
+ CheckShiftBy4]>]>]>>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td
index fcda2394bacf..ee7cc1f5095b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td
@@ -109,10 +109,7 @@ def ExynosScaledIdxFn : TIIPredicate<"isExynosScaledAddr",
def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>;
// Identify FP instructions.
-def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckHForm,
- CheckSForm,
- CheckDForm,
- CheckQForm]>>;
+def ExynosFPPred : MCSchedPredicate<CheckFpOrNEON>;
// Identify 128-bit NEON instructions.
def ExynosQFormPred : MCSchedPredicate<CheckQForm>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
index fc13b23b4cf8..4473f3a53845 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -53,152 +53,23 @@ let FunctionMapper = "AArch64_AM::getShiftType" in {
}
// Check for shifting in arithmetic and logic instructions.
-foreach I = {0-3, 8} in {
+foreach I = {0-4, 8} in {
let FunctionMapper = "AArch64_AM::getShiftValue" in
def CheckShiftBy#I : CheckImmOperand<3, I>;
}
// Generic predicates.
-
-// Identify whether an instruction is the 16-bit NEON form based on its result.
-def CheckHForm : CheckAll<[CheckIsRegOperand<0>,
- CheckAny<[CheckRegOperand<0, H0>,
- CheckRegOperand<0, H1>,
- CheckRegOperand<0, H2>,
- CheckRegOperand<0, H3>,
- CheckRegOperand<0, H4>,
- CheckRegOperand<0, H5>,
- CheckRegOperand<0, H6>,
- CheckRegOperand<0, H7>,
- CheckRegOperand<0, H8>,
- CheckRegOperand<0, H9>,
- CheckRegOperand<0, H10>,
- CheckRegOperand<0, H11>,
- CheckRegOperand<0, H12>,
- CheckRegOperand<0, H13>,
- CheckRegOperand<0, H14>,
- CheckRegOperand<0, H15>,
- CheckRegOperand<0, H16>,
- CheckRegOperand<0, H17>,
- CheckRegOperand<0, H18>,
- CheckRegOperand<0, H19>,
- CheckRegOperand<0, H20>,
- CheckRegOperand<0, H21>,
- CheckRegOperand<0, H22>,
- CheckRegOperand<0, H23>,
- CheckRegOperand<0, H24>,
- CheckRegOperand<0, H25>,
- CheckRegOperand<0, H26>,
- CheckRegOperand<0, H27>,
- CheckRegOperand<0, H28>,
- CheckRegOperand<0, H29>,
- CheckRegOperand<0, H30>,
- CheckRegOperand<0, H31>]>]>;
-
-// Identify whether an instruction is the 32-bit NEON form based on its result.
-def CheckSForm : CheckAll<[CheckIsRegOperand<0>,
- CheckAny<[CheckRegOperand<0, S0>,
- CheckRegOperand<0, S1>,
- CheckRegOperand<0, S2>,
- CheckRegOperand<0, S3>,
- CheckRegOperand<0, S4>,
- CheckRegOperand<0, S5>,
- CheckRegOperand<0, S6>,
- CheckRegOperand<0, S7>,
- CheckRegOperand<0, S8>,
- CheckRegOperand<0, S9>,
- CheckRegOperand<0, S10>,
- CheckRegOperand<0, S11>,
- CheckRegOperand<0, S12>,
- CheckRegOperand<0, S13>,
- CheckRegOperand<0, S14>,
- CheckRegOperand<0, S15>,
- CheckRegOperand<0, S16>,
- CheckRegOperand<0, S17>,
- CheckRegOperand<0, S18>,
- CheckRegOperand<0, S19>,
- CheckRegOperand<0, S20>,
- CheckRegOperand<0, S21>,
- CheckRegOperand<0, S22>,
- CheckRegOperand<0, S23>,
- CheckRegOperand<0, S24>,
- CheckRegOperand<0, S25>,
- CheckRegOperand<0, S26>,
- CheckRegOperand<0, S27>,
- CheckRegOperand<0, S28>,
- CheckRegOperand<0, S29>,
- CheckRegOperand<0, S30>,
- CheckRegOperand<0, S31>]>]>;
-
-// Identify whether an instruction is the 64-bit NEON form based on its result.
-def CheckDForm : CheckAll<[CheckIsRegOperand<0>,
- CheckAny<[CheckRegOperand<0, D0>,
- CheckRegOperand<0, D1>,
- CheckRegOperand<0, D2>,
- CheckRegOperand<0, D3>,
- CheckRegOperand<0, D4>,
- CheckRegOperand<0, D5>,
- CheckRegOperand<0, D6>,
- CheckRegOperand<0, D7>,
- CheckRegOperand<0, D8>,
- CheckRegOperand<0, D9>,
- CheckRegOperand<0, D10>,
- CheckRegOperand<0, D11>,
- CheckRegOperand<0, D12>,
- CheckRegOperand<0, D13>,
- CheckRegOperand<0, D14>,
- CheckRegOperand<0, D15>,
- CheckRegOperand<0, D16>,
- CheckRegOperand<0, D17>,
- CheckRegOperand<0, D18>,
- CheckRegOperand<0, D19>,
- CheckRegOperand<0, D20>,
- CheckRegOperand<0, D21>,
- CheckRegOperand<0, D22>,
- CheckRegOperand<0, D23>,
- CheckRegOperand<0, D24>,
- CheckRegOperand<0, D25>,
- CheckRegOperand<0, D26>,
- CheckRegOperand<0, D27>,
- CheckRegOperand<0, D28>,
- CheckRegOperand<0, D29>,
- CheckRegOperand<0, D30>,
- CheckRegOperand<0, D31>]>]>;
+// Identify whether an instruction is NEON or floating point
+def CheckFpOrNEON : CheckFunctionPredicateWithTII<
+ "AArch64_MC::isFpOrNEON",
+ "AArch64InstrInfo::isFpOrNEON"
+>;
// Identify whether an instruction is the 128-bit NEON form based on its result.
-def CheckQForm : CheckAll<[CheckIsRegOperand<0>,
- CheckAny<[CheckRegOperand<0, Q0>,
- CheckRegOperand<0, Q1>,
- CheckRegOperand<0, Q2>,
- CheckRegOperand<0, Q3>,
- CheckRegOperand<0, Q4>,
- CheckRegOperand<0, Q5>,
- CheckRegOperand<0, Q6>,
- CheckRegOperand<0, Q7>,
- CheckRegOperand<0, Q8>,
- CheckRegOperand<0, Q9>,
- CheckRegOperand<0, Q10>,
- CheckRegOperand<0, Q11>,
- CheckRegOperand<0, Q12>,
- CheckRegOperand<0, Q13>,
- CheckRegOperand<0, Q14>,
- CheckRegOperand<0, Q15>,
- CheckRegOperand<0, Q16>,
- CheckRegOperand<0, Q17>,
- CheckRegOperand<0, Q18>,
- CheckRegOperand<0, Q19>,
- CheckRegOperand<0, Q20>,
- CheckRegOperand<0, Q21>,
- CheckRegOperand<0, Q22>,
- CheckRegOperand<0, Q23>,
- CheckRegOperand<0, Q24>,
- CheckRegOperand<0, Q25>,
- CheckRegOperand<0, Q26>,
- CheckRegOperand<0, Q27>,
- CheckRegOperand<0, Q28>,
- CheckRegOperand<0, Q29>,
- CheckRegOperand<0, Q30>,
- CheckRegOperand<0, Q31>]>]>;
+def CheckQForm : CheckFunctionPredicateWithTII<
+ "AArch64_MC::isQForm",
+ "AArch64InstrInfo::isQForm"
+>;
// Identify arithmetic instructions with extend.
def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx,
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 77fca22a5f55..6ecfc97a4273 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -25,7 +25,8 @@ def TSV110Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ SMEUnsupported.F);
}
// Define each kind of processor resource and number available on the TSV110,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 893269c1a7ef..677797a6797b 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -91,7 +91,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, Align Alignment, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
const AArch64Subtarget &STI =
DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
@@ -100,38 +100,6 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
}
-
- // Check to see if there is a specialized entry-point for memory zeroing.
- ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
- ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
- const char *bzeroName =
- (V && V->isZero())
- ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
- : nullptr;
- // For small size (< 256), it is not beneficial to use bzero
- // instead of memset.
- if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {
- const AArch64TargetLowering &TLI = *STI.getTargetLowering();
-
- EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
- Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext());
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = Dst;
- Entry.Ty = IntPtrTy;
- Args.push_back(Entry);
- Entry.Node = Size;
- Args.push_back(Entry);
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(Chain)
- .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroName, IntPtr),
- std::move(Args))
- .setDiscardResult();
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
- return CallResult.second;
- }
return SDValue();
}
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 47fe3bf7dcf5..73f93724d6fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -34,7 +34,7 @@ public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
- bool isVolatile,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 566c7a16db23..24816bc9e9bd 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -42,20 +42,23 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemoryTaggingSupport.h"
#include <cassert>
#include <iterator>
+#include <memory>
#include <utility>
using namespace llvm;
@@ -63,12 +66,12 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-stack-tagging"
static cl::opt<bool> ClMergeInit(
- "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+ "stack-tagging-merge-init", cl::Hidden, cl::init(true),
cl::desc("merge stack variable initializers with tagging when possible"));
static cl::opt<bool>
ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden,
- cl::init(true), cl::ZeroOrMore,
+ cl::init(true),
cl::desc("Use Stack Safety analysis results"));
static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
@@ -78,6 +81,12 @@ static cl::opt<unsigned>
ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272),
cl::Hidden);
+static cl::opt<size_t> ClMaxLifetimes(
+ "stack-tagging-max-lifetimes-for-alloca", cl::Hidden, cl::init(3),
+ cl::ReallyHidden,
+ cl::desc("How many lifetime ends to handle for a single alloca."),
+ cl::Optional);
+
static const Align kTagGranuleSize = Align(16);
namespace {
@@ -283,15 +292,6 @@ public:
};
class AArch64StackTagging : public FunctionPass {
- struct AllocaInfo {
- AllocaInst *AI;
- TrackingVH<Instruction> OldAI; // Track through RAUW to replace debug uses.
- SmallVector<IntrinsicInst *, 2> LifetimeStart;
- SmallVector<IntrinsicInst *, 2> LifetimeEnd;
- SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
- int Tag; // -1 for non-tagged allocations
- };
-
const bool MergeInit;
const bool UseStackSafety;
@@ -307,7 +307,6 @@ public:
}
bool isInterestingAlloca(const AllocaInst &AI);
- void alignAndPadAlloca(AllocaInfo &Info);
void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr,
uint64_t Size);
@@ -316,9 +315,9 @@ public:
Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr,
uint64_t Size, InitializerBuilder &IB);
- Instruction *
- insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
- const DominatorTree *DT);
+ Instruction *insertBaseTaggedPointer(
+ const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas,
+ const DominatorTree *DT);
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "AArch64 Stack Tagging"; }
@@ -419,7 +418,7 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
bool IsInteresting =
AI.getAllocatedType()->isSized() && AI.isStaticAlloca() &&
// alloca() may be called with 0 size, ignore it.
- AI.getAllocationSizeInBits(*DL).getValue() > 0 &&
+ *AI.getAllocationSizeInBits(*DL) > 0 &&
// inalloca allocas are not treated as static, and we don't want
// dynamic alloca instrumentation for them as well.
!AI.isUsedWithInAlloca() &&
@@ -460,15 +459,13 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
}
Instruction *AArch64StackTagging::insertBaseTaggedPointer(
- const MapVector<AllocaInst *, AllocaInfo> &Allocas,
+ const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument,
const DominatorTree *DT) {
BasicBlock *PrologueBB = nullptr;
// Try sinking IRG as deep as possible to avoid hurting shrink wrap.
- for (auto &I : Allocas) {
- const AllocaInfo &Info = I.second;
+ for (auto &I : AllocasToInstrument) {
+ const memtag::AllocaInfo &Info = I.second;
AllocaInst *AI = Info.AI;
- if (Info.Tag < 0)
- continue;
if (!PrologueBB) {
PrologueBB = AI->getParent();
continue;
@@ -486,40 +483,6 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
return Base;
}
-void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
- const Align NewAlignment =
- max(MaybeAlign(Info.AI->getAlign()), kTagGranuleSize);
- Info.AI->setAlignment(NewAlignment);
-
- uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
- uint64_t AlignedSize = alignTo(Size, kTagGranuleSize);
- if (Size == AlignedSize)
- return;
-
- // Add padding to the alloca.
- Type *AllocatedType =
- Info.AI->isArrayAllocation()
- ? ArrayType::get(
- Info.AI->getAllocatedType(),
- cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
- : Info.AI->getAllocatedType();
- Type *PaddingType =
- ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size);
- Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType);
- auto *NewAI = new AllocaInst(
- TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
- NewAI->takeName(Info.AI);
- NewAI->setAlignment(Info.AI->getAlign());
- NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
- NewAI->setSwiftError(Info.AI->isSwiftError());
- NewAI->copyMetadata(*Info.AI);
-
- auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI);
- Info.AI->replaceAllUsesWith(NewPtr);
- Info.AI->eraseFromParent();
- Info.AI = NewAI;
-}
-
// FIXME: check for MTE extension
bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
@@ -532,76 +495,21 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (MergeInit)
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
- SmallVector<Instruction *, 8> RetVec;
- SmallVector<Instruction *, 4> UnrecognizedLifetimes;
-
- for (auto &BB : *F) {
- for (Instruction &I : BB) {
- if (auto *AI = dyn_cast<AllocaInst>(&I)) {
- Allocas[AI].AI = AI;
- Allocas[AI].OldAI = AI;
- continue;
- }
-
- if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) {
- for (Value *V : DVI->location_ops())
- if (auto *AI = dyn_cast_or_null<AllocaInst>(V))
- if (Allocas[AI].DbgVariableIntrinsics.empty() ||
- Allocas[AI].DbgVariableIntrinsics.back() != DVI)
- Allocas[AI].DbgVariableIntrinsics.push_back(DVI);
- continue;
- }
-
- auto *II = dyn_cast<IntrinsicInst>(&I);
- if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end)) {
- AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
- if (!AI) {
- UnrecognizedLifetimes.push_back(&I);
- continue;
- }
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- Allocas[AI].LifetimeStart.push_back(II);
- else
- Allocas[AI].LifetimeEnd.push_back(II);
- }
-
- if (isa<ReturnInst, ResumeInst, CleanupReturnInst>(&I))
- RetVec.push_back(&I);
- }
- }
+ memtag::StackInfoBuilder SIB(
+ [this](const AllocaInst &AI) { return isInterestingAlloca(AI); });
+ for (Instruction &I : instructions(F))
+ SIB.visit(I);
+ memtag::StackInfo &SInfo = SIB.get();
- if (Allocas.empty())
+ if (SInfo.AllocasToInstrument.empty())
return false;
- int NextTag = 0;
- int NumInterestingAllocas = 0;
- for (auto &I : Allocas) {
- AllocaInfo &Info = I.second;
- assert(Info.AI);
-
- if (!isInterestingAlloca(*Info.AI)) {
- Info.Tag = -1;
- continue;
- }
-
- alignAndPadAlloca(Info);
- NumInterestingAllocas++;
- Info.Tag = NextTag;
- NextTag = (NextTag + 1) % 16;
- }
-
- if (NumInterestingAllocas == 0)
- return true;
-
std::unique_ptr<DominatorTree> DeleteDT;
DominatorTree *DT = nullptr;
if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
DT = &P->getDomTree();
- if (DT == nullptr && (NumInterestingAllocas > 1 ||
- !F->hasFnAttribute(Attribute::OptimizeNone))) {
+ if (DT == nullptr) {
DeleteDT = std::make_unique<DominatorTree>(*F);
DT = DeleteDT.get();
}
@@ -611,38 +519,57 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>())
PDT = &P->getPostDomTree();
- if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) {
+ if (PDT == nullptr) {
DeletePDT = std::make_unique<PostDominatorTree>(*F);
PDT = DeletePDT.get();
}
+ std::unique_ptr<LoopInfo> DeleteLI;
+ LoopInfo *LI = nullptr;
+ if (auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>()) {
+ LI = &LIWP->getLoopInfo();
+ } else {
+ DeleteLI = std::make_unique<LoopInfo>(*DT);
+ LI = DeleteLI.get();
+ }
+
SetTagFunc =
Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
- Instruction *Base = insertBaseTaggedPointer(Allocas, DT);
+ Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT);
- for (auto &I : Allocas) {
- const AllocaInfo &Info = I.second;
+ int NextTag = 0;
+ for (auto &I : SInfo.AllocasToInstrument) {
+ memtag::AllocaInfo &Info = I.second;
+ assert(Info.AI && isInterestingAlloca(*Info.AI));
+ TrackingVH<Instruction> OldAI = Info.AI;
+ memtag::alignAndPadAlloca(Info, kTagGranuleSize);
AllocaInst *AI = Info.AI;
- if (Info.Tag < 0)
- continue;
-
+ int Tag = NextTag;
+ NextTag = (NextTag + 1) % 16;
// Replace alloca with tagp(alloca).
IRBuilder<> IRB(Info.AI->getNextNode());
Function *TagP = Intrinsic::getDeclaration(
F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()});
Instruction *TagPCall =
IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base,
- ConstantInt::get(IRB.getInt64Ty(), Info.Tag)});
+ ConstantInt::get(IRB.getInt64Ty(), Tag)});
if (Info.AI->hasName())
TagPCall->setName(Info.AI->getName() + ".tag");
Info.AI->replaceAllUsesWith(TagPCall);
TagPCall->setOperand(0, Info.AI);
- if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 &&
- Info.LifetimeEnd.size() == 1) {
+ // Calls to functions that may return twice (e.g. setjmp) confuse the
+ // postdominator analysis, and will leave us to keep memory tagged after
+ // function return. Work around this by always untagging at every return
+ // statement if return_twice functions are called.
+ bool StandardLifetime =
+ SInfo.UnrecognizedLifetimes.empty() &&
+ memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI,
+ ClMaxLifetimes) &&
+ !SInfo.CallsReturnTwice;
+ if (StandardLifetime) {
IntrinsicInst *Start = Info.LifetimeStart[0];
- IntrinsicInst *End = Info.LifetimeEnd[0];
uint64_t Size =
cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
Size = alignTo(Size, kTagGranuleSize);
@@ -650,14 +577,16 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); };
if (!DT || !PDT ||
- !forAllReachableExits(*DT, *PDT, Start, Info.LifetimeEnd, RetVec,
- TagEnd))
- End->eraseFromParent();
+ !memtag::forAllReachableExits(*DT, *PDT, *LI, Start, Info.LifetimeEnd,
+ SInfo.RetVec, TagEnd)) {
+ for (auto *End : Info.LifetimeEnd)
+ End->eraseFromParent();
+ }
} else {
- uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
+ uint64_t Size = *Info.AI->getAllocationSizeInBits(*DL) / 8;
Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
tagAlloca(AI, &*IRB.GetInsertPoint(), Ptr, Size);
- for (auto &RI : RetVec) {
+ for (auto &RI : SInfo.RetVec) {
untagAlloca(AI, RI, Size);
}
// We may have inserted tag/untag outside of any lifetime interval.
@@ -670,12 +599,12 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
// Fixup debug intrinsics to point to the new alloca.
for (auto DVI : Info.DbgVariableIntrinsics)
- DVI->replaceVariableLocationOp(Info.OldAI, Info.AI);
+ DVI->replaceVariableLocationOp(OldAI, Info.AI);
}
// If we have instrumented at least one alloca, all unrecognized lifetime
- // instrinsics have to go.
- for (auto &I : UnrecognizedLifetimes)
+ // intrinsics have to go.
+ for (auto &I : SInfo.UnrecognizedLifetimes)
I->eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index cae6d65bed2d..7e91dc1b6385 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -50,7 +50,6 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
static cl::opt<bool>
ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
- cl::ZeroOrMore,
cl::desc("Apply first slot optimization for stack tagging "
"(eliminate ADDG Rt, Rn, 0, 0)."));
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 8a7e20237271..15005304383d 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -21,6 +21,7 @@
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
@@ -51,6 +52,16 @@ static cl::opt<bool>
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
cl::desc("Enable the use of AA during codegen."));
+static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
+ "aarch64-insert-extract-base-cost",
+ cl::desc("Base cost of vector insert/extract element"), cl::Hidden);
+
+unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
+ if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
+ return OverrideVectorInsertExtractBaseCost;
+ return VectorInsertExtractBaseCost;
+}
+
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
// Determine default and user-specified characteristics
@@ -78,14 +89,17 @@ void AArch64Subtarget::initializeProperties() {
CacheLineSize = 64;
break;
case CortexA35:
- break;
case CortexA53:
case CortexA55:
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 4;
+ MaxBytesForLoopAlignment = 8;
break;
case CortexA57:
MaxInterleaveFactor = 4;
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 4;
+ MaxBytesForLoopAlignment = 8;
break;
case CortexA65:
PrefFunctionLogAlignment = 3;
@@ -93,6 +107,10 @@ void AArch64Subtarget::initializeProperties() {
case CortexA72:
case CortexA73:
case CortexA75:
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 4;
+ MaxBytesForLoopAlignment = 8;
+ break;
case CortexA76:
case CortexA77:
case CortexA78:
@@ -101,12 +119,21 @@ void AArch64Subtarget::initializeProperties() {
case CortexX1:
case CortexX1C:
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 5;
+ MaxBytesForLoopAlignment = 16;
break;
case CortexA510:
+ PrefFunctionLogAlignment = 4;
+ VScaleForTuning = 1;
+ PrefLoopLogAlignment = 4;
+ MaxBytesForLoopAlignment = 8;
+ break;
case CortexA710:
case CortexX2:
PrefFunctionLogAlignment = 4;
VScaleForTuning = 1;
+ PrefLoopLogAlignment = 5;
+ MaxBytesForLoopAlignment = 16;
break;
case A64FX:
CacheLineSize = 256;
@@ -221,6 +248,12 @@ void AArch64Subtarget::initializeProperties() {
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
+ case Ampere1:
+ CacheLineSize = 64;
+ PrefFunctionLogAlignment = 6;
+ PrefLoopLogAlignment = 6;
+ MaxInterleaveFactor = 4;
+ break;
}
}
@@ -352,6 +385,8 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
if (!UseAddressTopByteIgnored)
return false;
+ if (TargetTriple.isDriverKit())
+ return true;
if (TargetTriple.isiOS()) {
return TargetTriple.getiOSVersion() >= VersionTuple(8);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 7b2bbad30f85..c92e3e44de31 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include <string>
@@ -40,6 +40,7 @@ public:
enum ARMProcFamilyEnum : uint8_t {
Others,
A64FX,
+ Ampere1,
AppleA7,
AppleA10,
AppleA11,
@@ -87,191 +88,14 @@ protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;
- bool HasV8_0aOps = false;
- bool HasV8_1aOps = false;
- bool HasV8_2aOps = false;
- bool HasV8_3aOps = false;
- bool HasV8_4aOps = false;
- bool HasV8_5aOps = false;
- bool HasV8_6aOps = false;
- bool HasV8_7aOps = false;
- bool HasV8_8aOps = false;
- bool HasV9_0aOps = false;
- bool HasV9_1aOps = false;
- bool HasV9_2aOps = false;
- bool HasV9_3aOps = false;
- bool HasV8_0rOps = false;
-
- bool HasCONTEXTIDREL2 = false;
- bool HasEL2VMSA = false;
- bool HasEL3 = false;
- bool HasFPARMv8 = false;
- bool HasNEON = false;
- bool HasCrypto = false;
- bool HasDotProd = false;
- bool HasCRC = false;
- bool HasLSE = false;
- bool HasLSE2 = false;
- bool HasRAS = false;
- bool HasRDM = false;
- bool HasPerfMon = false;
- bool HasFullFP16 = false;
- bool HasFP16FML = false;
- bool HasSPE = false;
-
- bool FixCortexA53_835769 = false;
-
- // ARMv8.1 extensions
- bool HasVH = false;
- bool HasPAN = false;
- bool HasLOR = false;
-
- // ARMv8.2 extensions
- bool HasPsUAO = false;
- bool HasPAN_RWV = false;
- bool HasCCPP = false;
-
- // SVE extensions
- bool HasSVE = false;
- bool UseExperimentalZeroingPseudos = false;
- bool UseScalarIncVL = false;
-
- // Armv8.2 Crypto extensions
- bool HasSM4 = false;
- bool HasSHA3 = false;
- bool HasSHA2 = false;
- bool HasAES = false;
-
- // ARMv8.3 extensions
- bool HasPAuth = false;
- bool HasJS = false;
- bool HasCCIDX = false;
- bool HasComplxNum = false;
-
- // ARMv8.4 extensions
- bool HasNV = false;
- bool HasMPAM = false;
- bool HasDIT = false;
- bool HasTRACEV8_4 = false;
- bool HasAM = false;
- bool HasSEL2 = false;
- bool HasTLB_RMI = false;
- bool HasFlagM = false;
- bool HasRCPC_IMMO = false;
-
- bool HasLSLFast = false;
- bool HasRCPC = false;
- bool HasAggressiveFMA = false;
-
- // Armv8.5-A Extensions
- bool HasAlternativeNZCV = false;
- bool HasFRInt3264 = false;
- bool HasSpecRestrict = false;
- bool HasSSBS = false;
- bool HasSB = false;
- bool HasPredRes = false;
- bool HasCCDP = false;
- bool HasBTI = false;
- bool HasRandGen = false;
- bool HasMTE = false;
- bool HasTME = false;
-
- // Armv8.6-A Extensions
- bool HasBF16 = false;
- bool HasMatMulInt8 = false;
- bool HasMatMulFP32 = false;
- bool HasMatMulFP64 = false;
- bool HasAMVS = false;
- bool HasFineGrainedTraps = false;
- bool HasEnhancedCounterVirtualization = false;
-
- // Armv8.7-A Extensions
- bool HasXS = false;
- bool HasWFxT = false;
- bool HasHCX = false;
- bool HasLS64 = false;
-
- // Armv8.8-A Extensions
- bool HasHBC = false;
- bool HasMOPS = false;
-
- // Arm SVE2 extensions
- bool HasSVE2 = false;
- bool HasSVE2AES = false;
- bool HasSVE2SM4 = false;
- bool HasSVE2SHA3 = false;
- bool HasSVE2BitPerm = false;
-
- // Armv9-A Extensions
- bool HasRME = false;
-
- // Arm Scalable Matrix Extension (SME)
- bool HasSME = false;
- bool HasSMEF64 = false;
- bool HasSMEI64 = false;
- bool HasStreamingSVE = false;
-
- // AppleA7 system register.
- bool HasAppleA7SysReg = false;
-
- // Future architecture extensions.
- bool HasETE = false;
- bool HasTRBE = false;
- bool HasBRBE = false;
- bool HasSPE_EEF = false;
-
- // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
- bool HasZeroCycleRegMove = false;
-
- // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
- bool HasZeroCycleZeroing = false;
- bool HasZeroCycleZeroingGP = false;
- bool HasZeroCycleZeroingFPWorkaround = false;
-
- // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
- // as movi is more efficient across all cores. Newer cores can eliminate
- // fmovs early and there is no difference with movi, but this not true for
- // all implementations.
- bool HasZeroCycleZeroingFP = true;
-
- // StrictAlign - Disallow unaligned memory accesses.
- bool StrictAlign = false;
-
- // NegativeImmediates - transform instructions with negative immediates
- bool NegativeImmediates = true;
-
// Enable 64-bit vectorization in SLP.
unsigned MinVectorRegisterBitWidth = 64;
- bool OutlineAtomics = false;
- bool PredictableSelectIsExpensive = false;
- bool BalanceFPOps = false;
- bool CustomAsCheapAsMove = false;
- bool ExynosAsCheapAsMove = false;
- bool UsePostRAScheduler = false;
- bool Misaligned128StoreIsSlow = false;
- bool Paired128IsSlow = false;
- bool STRQroIsSlow = false;
- bool UseAlternateSExtLoadCVTF32Pattern = false;
- bool HasArithmeticBccFusion = false;
- bool HasArithmeticCbzFusion = false;
- bool HasCmpBccFusion = false;
- bool HasFuseAddress = false;
- bool HasFuseAES = false;
- bool HasFuseArithmeticLogic = false;
- bool HasFuseCCSelect = false;
- bool HasFuseCryptoEOR = false;
- bool HasFuseLiterals = false;
- bool DisableLatencySchedHeuristic = false;
- bool UseRSqrt = false;
- bool Force32BitJumpTables = false;
- bool UseEL1ForTP = false;
- bool UseEL2ForTP = false;
- bool UseEL3ForTP = false;
- bool AllowTaggedGlobals = false;
- bool HardenSlsRetBr = false;
- bool HardenSlsBlr = false;
- bool HardenSlsNoComdat = false;
+// Bool members corresponding to the SubtargetFeatures defined in tablegen
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool ATTRIBUTE = DEFAULT;
+#include "AArch64GenSubtargetInfo.inc"
+
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@@ -282,7 +106,6 @@ protected:
unsigned PrefLoopLogAlignment = 0;
unsigned MaxBytesForLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
- unsigned WideningBaseCost = 0;
// ReserveXRegister[i] - X#i is not available as a general purpose register.
BitVector ReserveXRegister;
@@ -331,6 +154,11 @@ public:
unsigned MinSVEVectorSizeInBitsOverride = 0,
unsigned MaxSVEVectorSizeInBitsOverride = 0);
+// Getters for SubtargetFeatures defined in tablegen
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool GETTER() const { return ATTRIBUTE; }
+#include "AArch64GenSubtargetInfo.inc"
+
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
@@ -351,9 +179,7 @@ public:
const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
- bool enablePostRAScheduler() const override {
- return UsePostRAScheduler;
- }
+ bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
/// Returns ARM processor family.
/// Avoid this function! CPU specifics should be kept local to this class
@@ -363,30 +189,6 @@ public:
return ARMProcFamily;
}
- bool hasV8_0aOps() const { return HasV8_0aOps; }
- bool hasV8_1aOps() const { return HasV8_1aOps; }
- bool hasV8_2aOps() const { return HasV8_2aOps; }
- bool hasV8_3aOps() const { return HasV8_3aOps; }
- bool hasV8_4aOps() const { return HasV8_4aOps; }
- bool hasV8_5aOps() const { return HasV8_5aOps; }
- bool hasV9_0aOps() const { return HasV9_0aOps; }
- bool hasV9_1aOps() const { return HasV9_1aOps; }
- bool hasV9_2aOps() const { return HasV9_2aOps; }
- bool hasV9_3aOps() const { return HasV9_3aOps; }
- bool hasV8_0rOps() const { return HasV8_0rOps; }
-
- bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
-
- bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; }
-
- bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; }
-
- bool hasZeroCycleZeroingFPWorkaround() const {
- return HasZeroCycleZeroingFPWorkaround;
- }
-
- bool requiresStrictAlign() const { return StrictAlign; }
-
bool isXRaySupported() const override { return true; }
unsigned getMinVectorRegisterBitWidth() const {
@@ -399,63 +201,16 @@ public:
return CustomCallSavedXRegs[i];
}
bool hasCustomCallingConv() const { return CustomCallSavedXRegs.any(); }
- bool hasFPARMv8() const { return HasFPARMv8; }
- bool hasNEON() const { return HasNEON; }
- bool hasCrypto() const { return HasCrypto; }
- bool hasDotProd() const { return HasDotProd; }
- bool hasCRC() const { return HasCRC; }
- bool hasLSE() const { return HasLSE; }
- bool hasLSE2() const { return HasLSE2; }
- bool hasRAS() const { return HasRAS; }
- bool hasRDM() const { return HasRDM; }
- bool hasSM4() const { return HasSM4; }
- bool hasSHA3() const { return HasSHA3; }
- bool hasSHA2() const { return HasSHA2; }
- bool hasAES() const { return HasAES; }
- bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; }
- bool balanceFPOps() const { return BalanceFPOps; }
- bool predictableSelectIsExpensive() const {
- return PredictableSelectIsExpensive;
- }
- bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
- bool hasExynosCheapAsMoveHandling() const { return ExynosAsCheapAsMove; }
- bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
- bool isPaired128Slow() const { return Paired128IsSlow; }
- bool isSTRQroSlow() const { return STRQroIsSlow; }
- bool useAlternateSExtLoadCVTF32Pattern() const {
- return UseAlternateSExtLoadCVTF32Pattern;
- }
- bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
- bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
- bool hasCmpBccFusion() const { return HasCmpBccFusion; }
- bool hasFuseAddress() const { return HasFuseAddress; }
- bool hasFuseAES() const { return HasFuseAES; }
- bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; }
- bool hasFuseCCSelect() const { return HasFuseCCSelect; }
- bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; }
- bool hasFuseLiterals() const { return HasFuseLiterals; }
/// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const {
return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
- hasFuseAES() || hasFuseArithmeticLogic() ||
- hasFuseCCSelect() || hasFuseLiterals();
+ hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() ||
+ hasFuseAdrpAdd() || hasFuseLiterals();
}
- bool hardenSlsRetBr() const { return HardenSlsRetBr; }
- bool hardenSlsBlr() const { return HardenSlsBlr; }
- bool hardenSlsNoComdat() const { return HardenSlsNoComdat; }
-
- bool useEL1ForTP() const { return UseEL1ForTP; }
- bool useEL2ForTP() const { return UseEL2ForTP; }
- bool useEL3ForTP() const { return UseEL3ForTP; }
-
- bool useRSqrt() const { return UseRSqrt; }
- bool force32BitJumpTables() const { return Force32BitJumpTables; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
- unsigned getVectorInsertExtractBaseCost() const {
- return VectorInsertExtractBaseCost;
- }
+ unsigned getVectorInsertExtractBaseCost() const;
unsigned getCacheLineSize() const override { return CacheLineSize; }
unsigned getPrefetchDistance() const override { return PrefetchDistance; }
unsigned getMinPrefetchStride(unsigned NumMemAccesses,
@@ -478,60 +233,10 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
- unsigned getWideningBaseCost() const { return WideningBaseCost; }
-
- bool useExperimentalZeroingPseudos() const {
- return UseExperimentalZeroingPseudos;
- }
-
- bool useScalarIncVL() const { return UseScalarIncVL; }
-
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
- bool hasPerfMon() const { return HasPerfMon; }
- bool hasFullFP16() const { return HasFullFP16; }
- bool hasFP16FML() const { return HasFP16FML; }
- bool hasSPE() const { return HasSPE; }
- bool hasLSLFast() const { return HasLSLFast; }
- bool hasSVE() const { return HasSVE; }
- bool hasSVE2() const { return HasSVE2; }
- bool hasRCPC() const { return HasRCPC; }
- bool hasAggressiveFMA() const { return HasAggressiveFMA; }
- bool hasAlternativeNZCV() const { return HasAlternativeNZCV; }
- bool hasFRInt3264() const { return HasFRInt3264; }
- bool hasSpecRestrict() const { return HasSpecRestrict; }
- bool hasSSBS() const { return HasSSBS; }
- bool hasSB() const { return HasSB; }
- bool hasPredRes() const { return HasPredRes; }
- bool hasCCDP() const { return HasCCDP; }
- bool hasBTI() const { return HasBTI; }
- bool hasRandGen() const { return HasRandGen; }
- bool hasMTE() const { return HasMTE; }
- bool hasTME() const { return HasTME; }
- // Arm SVE2 extensions
- bool hasSVE2AES() const { return HasSVE2AES; }
- bool hasSVE2SM4() const { return HasSVE2SM4; }
- bool hasSVE2SHA3() const { return HasSVE2SHA3; }
- bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
- bool hasMatMulInt8() const { return HasMatMulInt8; }
- bool hasMatMulFP32() const { return HasMatMulFP32; }
- bool hasMatMulFP64() const { return HasMatMulFP64; }
-
- // Armv8.6-A Extensions
- bool hasBF16() const { return HasBF16; }
- bool hasFineGrainedTraps() const { return HasFineGrainedTraps; }
- bool hasEnhancedCounterVirtualization() const {
- return HasEnhancedCounterVirtualization;
- }
-
- // Arm Scalable Matrix Extension (SME)
- bool hasSME() const { return HasSME; }
- bool hasSMEF64() const { return HasSMEF64; }
- bool hasSMEI64() const { return HasSMEI64; }
- bool hasStreamingSVE() const { return HasStreamingSVE; }
-
bool isLittleEndian() const { return IsLittle; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
@@ -552,42 +257,6 @@ public:
bool useAA() const override;
- bool outlineAtomics() const { return OutlineAtomics; }
-
- bool hasVH() const { return HasVH; }
- bool hasPAN() const { return HasPAN; }
- bool hasLOR() const { return HasLOR; }
-
- bool hasPsUAO() const { return HasPsUAO; }
- bool hasPAN_RWV() const { return HasPAN_RWV; }
- bool hasCCPP() const { return HasCCPP; }
-
- bool hasPAuth() const { return HasPAuth; }
- bool hasJS() const { return HasJS; }
- bool hasCCIDX() const { return HasCCIDX; }
- bool hasComplxNum() const { return HasComplxNum; }
-
- bool hasNV() const { return HasNV; }
- bool hasMPAM() const { return HasMPAM; }
- bool hasDIT() const { return HasDIT; }
- bool hasTRACEV8_4() const { return HasTRACEV8_4; }
- bool hasAM() const { return HasAM; }
- bool hasAMVS() const { return HasAMVS; }
- bool hasXS() const { return HasXS; }
- bool hasWFxT() const { return HasWFxT; }
- bool hasHCX() const { return HasHCX; }
- bool hasLS64() const { return HasLS64; }
- bool hasSEL2() const { return HasSEL2; }
- bool hasTLB_RMI() const { return HasTLB_RMI; }
- bool hasFlagM() const { return HasFlagM; }
- bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
- bool hasEL2VMSA() const { return HasEL2VMSA; }
- bool hasEL3() const { return HasEL3; }
- bool hasHBC() const { return HasHBC; }
- bool hasMOPS() const { return HasMOPS; }
-
- bool fixCortexA53_835769() const { return FixCortexA53_835769; }
-
bool addrSinkUsingGEPs() const override {
// Keeping GEPs inbounds is important for exploiting AArch64
// addressing-modes in ILP32 mode.
@@ -623,8 +292,6 @@ public:
bool enableEarlyIfConversion() const override;
- bool enableAdvancedRASplitCost() const override { return false; }
-
std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
bool isCallingConvWin64(CallingConv::ID CC) const {
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index cce5813fe6e9..f3788175c48d 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,23 +18,23 @@ include "llvm/TableGen/SearchableTable.td"
//===----------------------------------------------------------------------===//
def HasCCPP : Predicate<"Subtarget->hasCCPP()">,
- AssemblerPredicate<(all_of FeatureCCPP), "ccpp">;
+ AssemblerPredicateWithAll<(all_of FeatureCCPP), "ccpp">;
def HasPAN : Predicate<"Subtarget->hasPAN()">,
- AssemblerPredicate<(all_of FeaturePAN),
+ AssemblerPredicateWithAll<(all_of FeaturePAN),
"ARM v8.1 Privileged Access-Never extension">;
def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">,
- AssemblerPredicate<(all_of FeaturePsUAO),
+ AssemblerPredicateWithAll<(all_of FeaturePsUAO),
"ARM v8.2 UAO PState extension (psuao)">;
def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">,
- AssemblerPredicate<(all_of FeaturePAN_RWV),
+ AssemblerPredicateWithAll<(all_of FeaturePAN_RWV),
"ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">;
def HasCONTEXTIDREL2
: Predicate<"Subtarget->hasCONTEXTIDREL2()">,
- AssemblerPredicate<(all_of FeatureCONTEXTIDREL2),
+ AssemblerPredicateWithAll<(all_of FeatureCONTEXTIDREL2),
"Target contains CONTEXTIDR_EL2 RW operand">;
//===----------------------------------------------------------------------===//
@@ -631,6 +631,7 @@ def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>;
def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>;
def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
@@ -977,7 +978,6 @@ def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>;
def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>;
def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>;
def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>;
-def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>;
def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>;
def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 4af28fc070dd..3f9795f5198b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -12,6 +12,7 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64MachineScheduler.h"
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetObjectFile.h"
@@ -21,7 +22,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/CFIFixup.h"
#include "llvm/CodeGen/CSEConfigBase.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -31,6 +34,7 @@
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -59,6 +63,11 @@ static cl::opt<bool>
cl::desc("Enable the conditional branch tuning pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableAArch64CopyPropagation(
+ "aarch64-enable-copy-propagation",
+ cl::desc("Enable the copy propagation with AArch64 copy instr"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
@@ -265,7 +274,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
// On ELF platforms the default static relocation model has a smart enough
// linker to cope with referencing external symbols defined in a shared
// library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
- if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+ if (!RM || *RM == Reloc::DynamicNoPIC)
return Reloc::Static;
return *RM;
}
@@ -354,6 +363,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
// AArch64 supports the debug entry values.
setSupportsDebugEntryValues(true);
+
+ // AArch64 supports fixing up the DWARF unwind information.
+ if (!getMCAsmInfo()->usesWindowsCFI())
+ setCFIFixup(true);
}
AArch64TargetMachine::~AArch64TargetMachine() = default;
@@ -379,7 +392,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
if (VScaleRangeAttr.isValid()) {
Optional<unsigned> VScaleMax = VScaleRangeAttr.getVScaleRangeMax();
MinSVEVectorSize = VScaleRangeAttr.getVScaleRangeMin() * 128;
- MaxSVEVectorSize = VScaleMax ? VScaleMax.getValue() * 128 : 0;
+ MaxSVEVectorSize = VScaleMax ? *VScaleMax * 128 : 0;
} else {
MinSVEVectorSize = SVEVectorBitsMinOpt;
MaxSVEVectorSize = SVEVectorBitsMaxOpt;
@@ -468,15 +481,17 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+ ScheduleDAGMI *DAG =
+ new ScheduleDAGMI(C, std::make_unique<AArch64PostRASchedStrategy>(C),
+ /* RemoveKillFlags=*/true);
if (ST.hasFusion()) {
// Run the Macro Fusion after RA again since literals are expanded from
// pseudos then (v. addPreSched2()).
- ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;
}
- return nullptr;
+ return DAG;
}
void addIRPasses() override;
@@ -504,7 +519,7 @@ public:
} // end anonymous namespace
TargetTransformInfo
-AArch64TargetMachine::getTargetTransformInfo(const Function &F) {
+AArch64TargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(AArch64TTIImpl(this, F));
}
@@ -531,6 +546,7 @@ void AArch64PassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
+ .convertSwitchRangeToICmp(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
@@ -574,6 +590,9 @@ void AArch64PassConfig::addIRPasses() {
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows())
addPass(createCFGuardCheckPass());
+
+ if (TM->Options.JMCInstrument)
+ addPass(createJMCInstrumenterPass());
}
// Pass Pipeline Configuration
@@ -759,6 +778,10 @@ void AArch64PassConfig::addPreEmitPass() {
if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt)
addPass(createAArch64LoadStoreOptimizationPass());
+ if (TM->getOptLevel() >= CodeGenOpt::Aggressive &&
+ EnableAArch64CopyPropagation)
+ addPass(createMachineCopyPropagationPass(true));
+
addPass(createAArch64A53Fix835769());
if (EnableBranchTargets)
@@ -804,8 +827,7 @@ AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
bool AArch64TargetMachine::parseMachineFunctionInfo(
const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
SMDiagnostic &Error, SMRange &SourceRange) const {
- const auto &YamlMFI =
- reinterpret_cast<const yaml::AArch64FunctionInfo &>(MFI);
+ const auto &YamlMFI = static_cast<const yaml::AArch64FunctionInfo &>(MFI);
MachineFunction &MF = PFS.MF;
MF.getInfo<AArch64FunctionInfo>()->initializeBaseYamlFields(YamlMFI);
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 7d314bce99b1..beb109502ff9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -41,7 +41,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
TargetLoweringObjectFile* getObjFileLowering() const override {
return TLOF.get();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b2ffdf949d8b..41c7a8c5042f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -8,6 +8,7 @@
#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
+#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -15,8 +16,8 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
@@ -50,6 +51,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const {
+ assert(K != TargetTransformInfo::RGK_Scalar);
+ return K == TargetTransformInfo::RGK_FixedWidthVector;
+}
+
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
@@ -370,6 +377,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return Entry->Cost;
break;
}
+ case Intrinsic::fptosi_sat:
+ case Intrinsic::fptoui_sat: {
+ if (ICA.getArgTypes().empty())
+ break;
+ bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
+ auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
+ EVT MTy = TLI->getValueType(DL, RetTy);
+ // Check for the legal types, which are where the size of the input and the
+ // output are the same, or we are using cvt f64->i32 or f32->i64.
+ if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
+ LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
+ LT.second == MVT::v2f64) &&
+ (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
+ (LT.second == MVT::f64 && MTy == MVT::i32) ||
+ (LT.second == MVT::f32 && MTy == MVT::i64)))
+ return LT.first;
+ // Similarly for fp16 sizes
+ if (ST->hasFullFP16() &&
+ ((LT.second == MVT::f16 && MTy == MVT::i32) ||
+ ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
+ (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
+ return LT.first;
+
+ // Otherwise we use a legal convert followed by a min+max
+ if ((LT.second.getScalarType() == MVT::f32 ||
+ LT.second.getScalarType() == MVT::f64 ||
+ (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
+ LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
+ Type *LegalTy =
+ Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
+ if (LT.second.isVector())
+ LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
+ InstructionCost Cost = 1;
+ IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
+ LegalTy, {LegalTy, LegalTy});
+ Cost += getIntrinsicInstrCost(Attrs1, CostKind);
+ IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
+ LegalTy, {LegalTy, LegalTy});
+ Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+ return LT.first * Cost;
+ }
+ break;
+ }
default:
break;
}
@@ -525,6 +575,14 @@ static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
return IC.replaceInstUsesWith(II, EarliestReplacement);
}
+static Optional<Instruction *> instCombineSVESel(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IRBuilder<> Builder(&II);
+ auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
+ II.getOperand(2));
+ return IC.replaceInstUsesWith(II, Select);
+}
+
static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
IntrinsicInst &II) {
IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
@@ -594,8 +652,7 @@ static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
return None;
auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
- if (!VecIns ||
- VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+ if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
return None;
// Where the vector insert is a fixed constant vector insert into undef at
@@ -862,12 +919,14 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
if (isAllActivePredicate(Pred)) {
LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
+ Load->copyMetadata(II);
return IC.replaceInstUsesWith(II, Load);
}
CallInst *MaskedLoad =
Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
Pred, ConstantAggregateZero::get(VecTy));
+ MaskedLoad->copyMetadata(II);
return IC.replaceInstUsesWith(II, MaskedLoad);
}
@@ -883,12 +942,14 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
if (isAllActivePredicate(Pred)) {
- Builder.CreateStore(VecOp, VecPtr);
+ StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
+ Store->copyMetadata(II);
return IC.eraseInstFromFunction(II);
}
- Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL),
- Pred);
+ CallInst *MaskedStore = Builder.CreateMaskedStore(
+ VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
+ MaskedStore->copyMetadata(II);
return IC.eraseInstFromFunction(II);
}
@@ -1069,7 +1130,6 @@ static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
Value *BasePtr = II.getOperand(1);
Value *Index = II.getOperand(2);
Type *Ty = II.getType();
- Type *BasePtrTy = BasePtr->getType();
Value *PassThru = ConstantAggregateZero::get(Ty);
// Contiguous gather => masked load.
@@ -1085,8 +1145,8 @@ static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
Type *VecPtrTy = PointerType::getUnqual(Ty);
- Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
- IndexBase);
+ Value *Ptr = Builder.CreateGEP(
+ cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
CallInst *MaskedLoad =
Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
@@ -1104,10 +1164,9 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
Value *BasePtr = II.getOperand(2);
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();
- Type *BasePtrTy = BasePtr->getType();
// Contiguous scatter => masked store.
- // (sve.ld1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
+ // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
Value *IndexBase;
if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
@@ -1118,8 +1177,8 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
Align Alignment =
BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
- Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
- IndexBase);
+ Value *Ptr = Builder.CreateGEP(
+ cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
Type *VecPtrTy = PointerType::getUnqual(Ty);
Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
@@ -1165,6 +1224,52 @@ static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
return None;
}
+static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *A = II.getArgOperand(0);
+ Value *B = II.getArgOperand(1);
+ if (A == B)
+ return IC.replaceInstUsesWith(II, A);
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IRBuilder<> Builder(&II);
+ Value *Pred = II.getOperand(0);
+ Value *Vec = II.getOperand(1);
+ Value *Shift = II.getOperand(2);
+
+ // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
+ Value *AbsPred, *MergedValue;
+ if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
+ m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
+ !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
+ m_Value(MergedValue), m_Value(AbsPred), m_Value())))
+
+ return None;
+
+ // Transform is valid if any of the following are true:
+ // * The ABS merge value is an undef or non-negative
+ // * The ABS predicate is all active
+ // * The ABS predicate and the SRSHL predicates are the same
+ if (!isa<UndefValue>(MergedValue) &&
+ !match(MergedValue, m_NonNegative()) &&
+ AbsPred != Pred && !isAllActivePredicate(AbsPred))
+ return None;
+
+ // Only valid when the shift amount is non-negative, otherwise the rounding
+ // behaviour of SRSHL cannot be ignored.
+ if (!match(Shift, m_NonNegative()))
+ return None;
+
+ auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
+ {Pred, Vec, Shift});
+
+ return IC.replaceInstUsesWith(II, LSL);
+}
+
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -1172,6 +1277,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+ case Intrinsic::aarch64_neon_fmaxnm:
+ case Intrinsic::aarch64_neon_fminnm:
+ return instCombineMaxMinNM(IC, II);
case Intrinsic::aarch64_sve_convert_from_svbool:
return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
@@ -1227,6 +1335,10 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEST1(IC, II, DL);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
+ case Intrinsic::aarch64_sve_sel:
+ return instCombineSVESel(IC, II);
+ case Intrinsic::aarch64_sve_srshl:
+ return instCombineSVESrshl(IC, II);
}
return None;
@@ -1262,7 +1374,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
// A helper that returns a vector type from the given type. The number of
- // elements in type Ty determine the vector width.
+ // elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
return VectorType::get(ArgTy->getScalarType(),
cast<VectorType>(DstTy)->getElementCount());
@@ -1277,26 +1389,32 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
// "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
// instructions.
//
- // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // TODO: Add additional widening operations (e.g., shl, etc.) once we
// verify that their extending operands are eliminated during code
// generation.
switch (Opcode) {
case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ case Instruction::Mul: // SMULL(2), UMULL(2)
break;
default:
return false;
}
// To be a widening instruction (either the "wide" or "long" versions), the
- // second operand must be a sign- or zero extend having a single user. We
- // only consider extends having a single user because they may otherwise not
- // be eliminated.
+ // second operand must be a sign- or zero extend.
if (Args.size() != 2 ||
- (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
- !Args[1]->hasOneUse())
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
return false;
auto *Extend = cast<CastInst>(Args[1]);
+ auto *Arg0 = dyn_cast<CastInst>(Args[0]);
+
+ // A mul only has a mull version (not like addw). Both operands need to be
+ // extending and the same type.
+ if (Opcode == Instruction::Mul &&
+ (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
+ Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
+ return false;
// Legalize the destination type and ensure it can be used in a widening
// operation.
@@ -1334,7 +1452,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
// If the cast is observable, and it is used by a widening instruction (e.g.,
// uaddl, saddw, etc.), it may be free.
- if (I && I->hasOneUse()) {
+ if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
@@ -1606,6 +1724,36 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost);
+ static const TypeConversionCostTblEntry FP16Tbl[] = {
+ {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
+ {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
+ {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
+ {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
+ {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
+ {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
+ {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
+ {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
+ {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
+ {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
+ {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
+ {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
+ {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
+ {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
+ {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
+ {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
+ {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
+ {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
+ {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
+ {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
+ {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
+ {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
+ };
+
+ if (ST->hasFullFP16())
+ if (const auto *Entry = ConvertCostTableLookup(
+ FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost);
+
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
@@ -1723,24 +1871,12 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-
- // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
- // add in the widening overhead specified by the sub-target. Since the
- // extends feeding widening instructions are performed automatically, they
- // aren't present in the generated code and have a zero cost. By adding a
- // widening overhead here, we attach the total cost of the combined operation
- // to the widening instruction.
- InstructionCost Cost = 0;
- if (isWideningInstruction(Ty, Opcode, Args))
- Cost += ST->getWideningBaseCost();
-
int ISD = TLI->InstructionOpcodeToISD(Opcode);
switch (ISD) {
default:
- return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
- Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
case ISD::SDIV:
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
@@ -1748,26 +1884,22 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties many not be same as that of previous
// operation; conservatively assume OP_None.
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
+ InstructionCost Cost = getArithmeticInstrCost(
+ Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
+ Opd2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
- Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
+ Cost += getArithmeticInstrCost(
+ Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
+ Opd2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return Cost;
}
LLVM_FALLTHROUGH;
- case ISD::UDIV:
+ case ISD::UDIV: {
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
auto VT = TLI->getValueType(DL, Ty);
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
@@ -1787,9 +1919,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
}
}
- Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
- Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ InstructionCost Cost = BaseT::getArithmeticInstrCost(
+ Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
if (Ty->isVectorTy()) {
// On AArch64, vector divisions are not supported natively and are
// expanded into scalar divisions of each pair of elements.
@@ -1804,27 +1935,31 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
Cost += Cost;
}
return Cost;
-
+ }
case ISD::MUL:
- if (LT.second != MVT::v2i64)
- return (Cost + 1) * LT.first;
// Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
// as elements are extracted from the vectors and the muls scalarized.
// As getScalarizationOverhead is a bit too pessimistic, we estimate the
// cost for a i64 vector directly here, which is:
- // - four i64 extracts,
- // - two i64 inserts, and
- // - two muls.
- // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
- // LT.first = 2 the cost is 16.
- return LT.first * 8;
+ // - four 2-cost i64 extracts,
+ // - two 2-cost i64 inserts, and
+ // - two 1-cost muls.
+ // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
+ // LT.first = 2 the cost is 28. If both operands are extensions it will not
+ // need to scalarize so the cost can be cheaper (smull or umull).
+ if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+ return LT.first;
+ return LT.first * 14;
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
case ISD::AND:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::SHL:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return (Cost + 1) * LT.first;
+ return LT.first;
case ISD::FADD:
case ISD::FSUB:
@@ -1834,11 +1969,10 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// These nodes are marked as 'custom' just to lower them to SVE.
// We know said lowering will incur no additional cost.
if (!Ty->getScalarType()->isFP128Ty())
- return (Cost + 2) * LT.first;
+ return 2 * LT.first;
- return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
- Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo, Opd2PropInfo);
}
}
@@ -1946,6 +2080,10 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
+bool AArch64TTIImpl::prefersVectorizedAddressing() const {
+ return ST->hasSVE();
+}
+
InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,
@@ -2559,11 +2697,97 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ // If we have a Mask, and the LT is being legalized somehow, split the Mask
+ // into smaller vectors and sum the cost of each shuffle.
+ if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
+ Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
+ cast<FixedVectorType>(Tp)->getNumElements() >
+ LT.second.getVectorNumElements() &&
+ !Index && !SubTp) {
+ unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
+ assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
+ unsigned LTNumElts = LT.second.getVectorNumElements();
+ unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
+ VectorType *NTp =
+ VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
+ InstructionCost Cost;
+ for (unsigned N = 0; N < NumVecs; N++) {
+ SmallVector<int> NMask;
+ // Split the existing mask into chunks of size LTNumElts. Track the source
+ // sub-vectors to ensure the result has at most 2 inputs.
+ unsigned Source1, Source2;
+ unsigned NumSources = 0;
+ for (unsigned E = 0; E < LTNumElts; E++) {
+ int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
+ : UndefMaskElem;
+ if (MaskElt < 0) {
+ NMask.push_back(UndefMaskElem);
+ continue;
+ }
+
+ // Calculate which source from the input this comes from and whether it
+ // is new to us.
+ unsigned Source = MaskElt / LTNumElts;
+ if (NumSources == 0) {
+ Source1 = Source;
+ NumSources = 1;
+ } else if (NumSources == 1 && Source != Source1) {
+ Source2 = Source;
+ NumSources = 2;
+ } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
+ NumSources++;
+ }
+
+ // Add to the new mask. For the NumSources>2 case these are not correct,
+ // but are only used for the modular lane number.
+ if (Source == Source1)
+ NMask.push_back(MaskElt % LTNumElts);
+ else if (Source == Source2)
+ NMask.push_back(MaskElt % LTNumElts + LTNumElts);
+ else
+ NMask.push_back(MaskElt % LTNumElts);
+ }
+ // If the sub-mask has at most 2 input sub-vectors then re-cost it using
+ // getShuffleCost. If not then cost it using the worst case.
+ if (NumSources <= 2)
+ Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
+ : TTI::SK_PermuteTwoSrc,
+ NTp, NMask, 0, nullptr, Args);
+ else if (any_of(enumerate(NMask), [&](const auto &ME) {
+ return ME.value() % LTNumElts == ME.index();
+ }))
+ Cost += LTNumElts - 1;
+ else
+ Cost += LTNumElts;
+ }
+ return Cost;
+ }
+
Kind = improveShuffleKindFromMask(Kind, Mask);
+
+ // Check for broadcast loads.
+ if (Kind == TTI::SK_Broadcast) {
+ bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
+ if (IsLoad && LT.second.isVector() &&
+ isLegalBroadcastLoad(Tp->getElementType(),
+ LT.second.getVectorElementCount()))
+ return 0; // broadcast is handled by ld1r
+ }
+
+ // If we have 4 elements for the shuffle and a Mask, get the cost straight
+ // from the perfect shuffle tables.
+ if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
+ (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
+ all_of(Mask, [](int E) { return E < 8; }))
+ return getPerfectShuffleCost(Mask);
+
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
Kind == TTI::SK_Reverse) {
+
static const CostTblEntry ShuffleTbl[] = {
// Broadcast shuffle kinds can be performed with 'dup'.
{ TTI::SK_Broadcast, MVT::v8i8, 1 },
@@ -2618,6 +2842,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
{ TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
{ TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
+ { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT
+ { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT
+ { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT
+ { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64
+ { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64
+ { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64
// Broadcast shuffle kinds for scalable vectors
{ TTI::SK_Broadcast, MVT::nxv16i8, 1 },
{ TTI::SK_Broadcast, MVT::nxv8i16, 1 },
@@ -2655,11 +2885,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_Reverse, MVT::nxv4i1, 1 },
{ TTI::SK_Reverse, MVT::nxv2i1, 1 },
};
- std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
}
+
if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
return getSpliceCost(Tp, Index);
+
+ // Inserting a subvector can often be done with either a D, S or H register
+ // move, so long as the inserted vector is "aligned".
+ if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
+ LT.second.getSizeInBits() <= 128 && SubTp) {
+ std::pair<InstructionCost, MVT> SubLT =
+ TLI->getTypeLegalizationCost(DL, SubTp);
+ if (SubLT.second.isVector()) {
+ int NumElts = LT.second.getVectorNumElements();
+ int NumSubElts = SubLT.second.getVectorNumElements();
+ if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+ return SubLT.first;
+ }
+ }
+
return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6029b9f2445..d0aacb457a39 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@ public:
return ST->getVScaleForTuning();
}
+ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
+
/// Try to return an estimate cost factor that can be used as a multiplier
/// when scalarizing an operation for a vector with ElementCount \p VF.
/// For scalable vectors this currently takes the most pessimistic view based
@@ -148,6 +150,8 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
+ bool prefersVectorizedAddressing() const;
+
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind);
@@ -278,6 +282,23 @@ public:
return isLegalMaskedGatherScatter(DataType);
}
+ bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const {
+ // Return true if we can generate a `ld1r` splat load instruction.
+ if (!ST->hasNEON() || NumElements.isScalable())
+ return false;
+ switch (unsigned ElementBits = ElementTy->getScalarSizeInBits()) {
+ case 8:
+ case 16:
+ case 32:
+ case 64: {
+ // We accept bit-widths >= 64bits and elements {8,16,32,64} bits.
+ unsigned VectorBits = NumElements.getFixedValue() * ElementBits;
+ return VectorBits >= 64;
+ }
+ }
+ return false;
+ }
+
bool isLegalNTStore(Type *DataType, Align Alignment) {
// NOTE: The logic below is mostly geared towards LV, which calls it with
// vectors with 2 elements. We might want to improve that, if other
@@ -330,7 +351,8 @@ public:
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args = None);
/// @}
};
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 33ed7ae9780e..ade23f643538 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -127,7 +127,7 @@ private:
return Prefix;
}
- PrefixInfo() : Active(false), Predicated(false) {}
+ PrefixInfo() = default;
bool isActive() const { return Active; }
bool isPredicated() const { return Predicated; }
unsigned getElementSize() const {
@@ -141,8 +141,8 @@ private:
}
private:
- bool Active;
- bool Predicated;
+ bool Active = false;
+ bool Predicated = false;
unsigned ElementSize;
unsigned Dst;
unsigned Pg;
@@ -157,7 +157,8 @@ private:
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
- AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+ AArch64CC::CondCode parseCondCodeString(StringRef Cond,
+ std::string &Suggestion);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
bool parseRegister(OperandVector &Operands);
@@ -189,6 +190,7 @@ private:
bool parseDirectiveUnreq(SMLoc L);
bool parseDirectiveCFINegateRAState();
bool parseDirectiveCFIBKeyFrame();
+ bool parseDirectiveCFIMTETaggedFrame();
bool parseDirectiveVariantPCS(SMLoc L);
@@ -2425,7 +2427,7 @@ static Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix,
}
static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) {
- return parseVectorKind(Suffix, VectorKind).hasValue();
+ return parseVectorKind(Suffix, VectorKind).has_value();
}
static unsigned matchSVEDataVectorRegName(StringRef Name) {
@@ -2758,8 +2760,8 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
}
auto PRFM = LookupByEncoding(MCE->getValue());
- Operands.push_back(AArch64Operand::CreatePrefetch(
- prfop, PRFM.getValueOr(""), S, getContext()));
+ Operands.push_back(AArch64Operand::CreatePrefetch(prfop, PRFM.value_or(""),
+ S, getContext()));
return MatchOperand_Success;
}
@@ -3029,8 +3031,10 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
return MatchOperand_Success;
}
-/// parseCondCodeString - Parse a Condition Code string.
-AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+/// parseCondCodeString - Parse a Condition Code string, optionally returning a
+/// suggestion to help common typos.
+AArch64CC::CondCode
+AArch64AsmParser::parseCondCodeString(StringRef Cond, std::string &Suggestion) {
AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
.Case("eq", AArch64CC::EQ)
.Case("ne", AArch64CC::NE)
@@ -3053,7 +3057,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
.Default(AArch64CC::Invalid);
if (CC == AArch64CC::Invalid &&
- getSTI().getFeatureBits()[AArch64::FeatureSVE])
+ getSTI().getFeatureBits()[AArch64::FeatureSVE]) {
CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
.Case("none", AArch64CC::EQ)
.Case("any", AArch64CC::NE)
@@ -3067,6 +3071,9 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
.Case("tstop", AArch64CC::LT)
.Default(AArch64CC::Invalid);
+ if (CC == AArch64CC::Invalid && Cond.lower() == "nfirst")
+ Suggestion = "nfrst";
+ }
return CC;
}
@@ -3078,9 +3085,14 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
StringRef Cond = Tok.getString();
- AArch64CC::CondCode CC = parseCondCodeString(Cond);
- if (CC == AArch64CC::Invalid)
- return TokError("invalid condition code");
+ std::string Suggestion;
+ AArch64CC::CondCode CC = parseCondCodeString(Cond, Suggestion);
+ if (CC == AArch64CC::Invalid) {
+ std::string Msg = "invalid condition code";
+ if (!Suggestion.empty())
+ Msg += ", did you mean " + Suggestion + "?";
+ return TokError(Msg);
+ }
Lex(); // Eat identifier token.
if (invertCondCode) {
@@ -3910,7 +3922,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
const MCRegisterInfo *RI = getContext().getRegisterInfo();
unsigned PrevReg = FirstReg;
- unsigned Count = 1;
SmallSet<unsigned, 8> DRegs;
AArch64Operand::ComputeRegsForAlias(FirstReg, DRegs, ElementWidth);
@@ -3942,7 +3953,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
}
PrevReg = Reg;
- ++Count;
}
if (parseToken(AsmToken::RCurly, "'}' expected"))
@@ -4545,9 +4555,14 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
(Head.data() - Name.data()));
- AArch64CC::CondCode CC = parseCondCodeString(Head);
- if (CC == AArch64CC::Invalid)
- return Error(SuffixLoc, "invalid condition code");
+ std::string Suggestion;
+ AArch64CC::CondCode CC = parseCondCodeString(Head, Suggestion);
+ if (CC == AArch64CC::Invalid) {
+ std::string Msg = "invalid condition code";
+ if (!Suggestion.empty())
+ Msg += ", did you mean " + Suggestion + "?";
+ return Error(SuffixLoc, Msg);
+ }
Operands.push_back(AArch64Operand::CreateToken(".", SuffixLoc, getContext(),
/*IsSuffix=*/true));
Operands.push_back(
@@ -6024,6 +6039,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveCFINegateRAState();
else if (IDVal == ".cfi_b_key_frame")
parseDirectiveCFIBKeyFrame();
+ else if (IDVal == ".cfi_mte_tagged_frame")
+ parseDirectiveCFIMTETaggedFrame();
else if (IDVal == ".arch_extension")
parseDirectiveArchExtension(Loc);
else if (IDVal == ".variant_pcs")
@@ -6198,12 +6215,11 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
if (Extension.Features.none())
report_fatal_error("unsupported architectural extension: " + Name);
- FeatureBitset ToggleFeatures = EnableFeature
- ? (~Features & Extension.Features)
- : ( Features & Extension.Features);
- FeatureBitset Features =
- ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
- setAvailableFeatures(Features);
+ FeatureBitset ToggleFeatures =
+ EnableFeature
+ ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
+ : STI.ToggleFeature(Features & Extension.Features);
+ setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
break;
}
}
@@ -6217,8 +6233,7 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
StringRef Name = getParser().parseStringToEndOfStatement().trim();
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.arch_extension' directive"))
+ if (parseEOL())
return true;
bool EnableFeature = true;
@@ -6236,12 +6251,11 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
if (Extension.Features.none())
return Error(ExtLoc, "unsupported architectural extension: " + Name);
- FeatureBitset ToggleFeatures = EnableFeature
- ? (~Features & Extension.Features)
- : (Features & Extension.Features);
- FeatureBitset Features =
- ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
- setAvailableFeatures(Features);
+ FeatureBitset ToggleFeatures =
+ EnableFeature
+ ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
+ : STI.ToggleFeature(Features & Extension.Features);
+ setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
return false;
}
@@ -6281,7 +6295,6 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);
- FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
// Advance source location past '+'.
CurLoc = incrementLoc(CurLoc, 1);
@@ -6301,12 +6314,12 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
if (Extension.Features.none())
report_fatal_error("unsupported architectural extension: " + Name);
- FeatureBitset ToggleFeatures = EnableFeature
- ? (~Features & Extension.Features)
- : ( Features & Extension.Features);
- FeatureBitset Features =
- ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
- setAvailableFeatures(Features);
+ FeatureBitset Features = STI.getFeatureBits();
+ FeatureBitset ToggleFeatures =
+ EnableFeature
+ ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
+ : STI.ToggleFeature(Features & Extension.Features);
+ setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
FoundExtension = true;
break;
@@ -6401,12 +6414,10 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
if (Idx + 1 == NbArgs)
break;
- if (parseToken(AsmToken::Comma,
- "unexpected token in '" + Twine(IDVal) + "' directive"))
+ if (parseComma())
return true;
}
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '" + Twine(IDVal) + "' directive"))
+ if (parseEOL())
return true;
getStreamer().emitLOHDirective((MCLOHType)Kind, Args);
@@ -6416,7 +6427,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
/// parseDirectiveLtorg
/// ::= .ltorg | .pool
bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
- if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ if (parseEOL())
return true;
getTargetStreamer().emitCurrentConstantPool();
return false;
@@ -6474,8 +6485,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
return Error(SRegLoc, "register name or alias expected");
// Shouldn't be anything else.
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected input in .req directive"))
+ if (parseEOL())
return true;
auto pair = std::make_pair(RegisterKind, (unsigned) RegNum);
@@ -6496,7 +6506,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
}
bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
- if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ if (parseEOL())
return true;
getStreamer().emitCFINegateRAState();
return false;
@@ -6505,31 +6515,31 @@ bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
/// parseDirectiveCFIBKeyFrame
/// ::= .cfi_b_key
bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
- if (parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.cfi_b_key_frame'"))
+ if (parseEOL())
return true;
getStreamer().emitCFIBKeyFrame();
return false;
}
+/// parseDirectiveCFIMTETaggedFrame
+/// ::= .cfi_mte_tagged_frame
+bool AArch64AsmParser::parseDirectiveCFIMTETaggedFrame() {
+ if (parseEOL())
+ return true;
+ getStreamer().emitCFIMTETaggedFrame();
+ return false;
+}
+
/// parseDirectiveVariantPCS
/// ::= .variant_pcs symbolname
bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) {
- const AsmToken &Tok = getTok();
- if (Tok.isNot(AsmToken::Identifier))
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
return TokError("expected symbol name");
-
- StringRef SymbolName = Tok.getIdentifier();
-
- MCSymbol *Sym = getContext().lookupSymbol(SymbolName);
- if (!Sym)
- return TokError("unknown symbol");
-
- Lex(); // Eat the symbol
-
if (parseEOL())
return true;
- getTargetStreamer().emitDirectiveVariantPCS(Sym);
+ getTargetStreamer().emitDirectiveVariantPCS(
+ getContext().getOrCreateSymbol(Name));
return false;
}
@@ -6880,7 +6890,7 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
// as a literal token.
if (Op.isTokenEqual("za"))
return Match_Success;
- break;
+ return Match_InvalidOperand;
}
if (!Op.isImm())
return Match_InvalidOperand;
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 9ce00f76d9c7..1b65589416c3 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -16,9 +16,10 @@
#include "TargetInfo/AArch64TargetInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm-c/Disassembler.h"
+#include "llvm/MC/MCDecoderOps.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
@@ -37,213 +38,226 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
// Forward declare these because the autogenerated code will reference them.
// Definitions are further down.
-static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst,
- unsigned RegNo,
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
+ const MCDisassembler *Decoder);
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
template <unsigned NumBitsForTile>
static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst,
- unsigned RegMask,
- uint64_t Address,
- const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
- uint64_t Address, const void *Decoder);
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder);
-static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder);
-static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder);
+static DecodeStatus
+DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
+ const MCDisassembler *Decoder);
template <int Bits>
static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
template <int ElementWidth>
-static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder);
+ uint64_t Addr,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder);
+ const MCDisassembler *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -270,7 +284,8 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
static MCDisassembler *createAArch64Disassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new AArch64Disassembler(STI, Ctx);
+
+ return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo());
}
DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
@@ -295,67 +310,37 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
DecodeStatus Result =
decodeInstruction(Table, MI, Insn, Address, this, STI);
- switch (MI.getOpcode()) {
- default:
- break;
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+
// For Scalable Matrix Extension (SME) instructions that have an implicit
- // operand for the accumulator (ZA) which isn't encoded, manually insert
- // operand.
- case AArch64::LDR_ZA:
- case AArch64::STR_ZA: {
- MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZA));
- // Spill and fill instructions have a single immediate used for both the
- // vector select offset and optional memory offset. Replicate the decoded
- // immediate.
+ // operand for the accumulator (ZA) or implicit immediate zero which isn't
+ // encoded, manually insert operand.
+ for (unsigned i = 0; i < Desc.getNumOperands(); i++) {
+ if (Desc.OpInfo[i].OperandType == MCOI::OPERAND_REGISTER) {
+ switch (Desc.OpInfo[i].RegClass) {
+ default:
+ break;
+ case AArch64::MPRRegClassID:
+ MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA));
+ break;
+ case AArch64::MPR8RegClassID:
+ MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0));
+ break;
+ }
+ } else if (Desc.OpInfo[i].OperandType ==
+ AArch64::OPERAND_IMPLICIT_IMM_0) {
+ MI.insert(MI.begin() + i, MCOperand::createImm(0));
+ }
+ }
+
+ if (MI.getOpcode() == AArch64::LDR_ZA ||
+ MI.getOpcode() == AArch64::STR_ZA) {
+ // Spill and fill instructions have a single immediate used for both
+ // the vector select offset and optional memory offset. Replicate
+ // the decoded immediate.
const MCOperand &Imm4Op = MI.getOperand(2);
assert(Imm4Op.isImm() && "Unexpected operand type!");
MI.addOperand(Imm4Op);
- break;
- }
- case AArch64::LD1_MXIPXX_H_B:
- case AArch64::LD1_MXIPXX_V_B:
- case AArch64::ST1_MXIPXX_H_B:
- case AArch64::ST1_MXIPXX_V_B:
- case AArch64::INSERT_MXIPZ_H_B:
- case AArch64::INSERT_MXIPZ_V_B:
- // e.g.
- // MOVA ZA0<HV>.B[<Ws>, <imm>], <Pg>/M, <Zn>.B
- // ^ insert implicit 8-bit element tile
- MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZAB0));
- break;
- case AArch64::EXTRACT_ZPMXI_H_B:
- case AArch64::EXTRACT_ZPMXI_V_B:
- // MOVA <Zd>.B, <Pg>/M, ZA0<HV>.B[<Ws>, <imm>]
- // ^ insert implicit 8-bit element tile
- MI.insert(MI.begin()+2, MCOperand::createReg(AArch64::ZAB0));
- break;
- case AArch64::LD1_MXIPXX_H_Q:
- case AArch64::LD1_MXIPXX_V_Q:
- case AArch64::ST1_MXIPXX_H_Q:
- case AArch64::ST1_MXIPXX_V_Q:
- // 128-bit load/store have implicit zero vector index.
- MI.insert(MI.begin()+2, MCOperand::createImm(0));
- break;
- // 128-bit mova have implicit zero vector index.
- case AArch64::INSERT_MXIPZ_H_Q:
- case AArch64::INSERT_MXIPZ_V_Q:
- MI.insert(MI.begin()+2, MCOperand::createImm(0));
- break;
- case AArch64::EXTRACT_ZPMXI_H_Q:
- case AArch64::EXTRACT_ZPMXI_V_Q:
- MI.addOperand(MCOperand::createImm(0));
- break;
- case AArch64::SMOVvi8to32_idx0:
- case AArch64::SMOVvi8to64_idx0:
- case AArch64::SMOVvi16to32_idx0:
- case AArch64::SMOVvi16to64_idx0:
- case AArch64::SMOVvi32to64_idx0:
- case AArch64::UMOVvi8_idx0:
- case AArch64::UMOVvi16_idx0:
- case AArch64::UMOVvi32_idx0:
- case AArch64::UMOVvi64_idx0:
- MI.addOperand(MCOperand::createImm(0));
- break;
}
if (Result != MCDisassembler::Fail)
@@ -400,7 +385,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() {
static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -410,9 +395,9 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
-static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 15)
return Fail;
return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
@@ -420,7 +405,7 @@ static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -432,7 +417,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -444,7 +429,7 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -456,7 +441,7 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -466,9 +451,9 @@ static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
-static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 30)
return Fail;
@@ -481,7 +466,7 @@ static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -491,10 +476,9 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
-static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus
+DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
+ const MCDisassembler *Decoder) {
if (RegNo > 22)
return Fail;
if (RegNo & 1)
@@ -509,7 +493,7 @@ static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -518,10 +502,10 @@ static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
-static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 3)
return Fail;
@@ -534,7 +518,7 @@ static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -546,7 +530,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -558,7 +542,7 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void* Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
@@ -570,7 +554,7 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 15)
return Fail;
return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
@@ -578,7 +562,7 @@ static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 7)
return Fail;
return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
@@ -586,7 +570,7 @@ static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void* Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -597,7 +581,7 @@ static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void* Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -608,7 +592,7 @@ static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const void* Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -617,10 +601,10 @@ static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
-static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst,
- unsigned RegMask,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus
+DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
if (RegMask > 0xFF)
return Fail;
Inst.addOperand(MCOperand::createImm(RegMask));
@@ -641,7 +625,8 @@ static const SmallVector<SmallVector<unsigned, 16>, 5>
template <unsigned NumBitsForTile>
static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
unsigned LastReg = (1 << NumBitsForTile) - 1;
if (RegNo > LastReg)
return Fail;
@@ -651,7 +636,8 @@ static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 15)
return Fail;
@@ -663,7 +649,7 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void* Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 7)
return Fail;
@@ -672,7 +658,8 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -682,7 +669,8 @@ static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -693,7 +681,7 @@ static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -703,7 +691,8 @@ static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -713,7 +702,8 @@ static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -724,7 +714,7 @@ static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (RegNo > 31)
return Fail;
unsigned Register =
@@ -735,7 +725,7 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
// scale{5} is asserted as 1 in tblgen.
Imm |= 0x20;
Inst.addOperand(MCOperand::createImm(64 - Imm));
@@ -744,29 +734,29 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
Inst.addOperand(MCOperand::createImm(64 - Imm));
return Success;
}
static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
int64_t ImmVal = Imm;
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
// Sign-extend 19-bit immediate.
if (ImmVal & (1 << (19 - 1)))
ImmVal |= ~((1LL << 19) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr,
- Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(
+ Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(ImmVal));
return Success;
}
static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1));
Inst.addOperand(MCOperand::createImm(Imm & 1));
return Success;
@@ -774,7 +764,7 @@ static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm,
uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
Inst.addOperand(MCOperand::createImm(Imm));
// Every system register in the encoding space is valid with the syntax
@@ -784,7 +774,7 @@ static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm,
uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
Inst.addOperand(MCOperand::createImm(Imm));
return Success;
@@ -792,7 +782,7 @@ static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
// This decoder exists to add the dummy Lane operand to the MCInst, which must
// be 1 in assembly but has no other real manifestation.
unsigned Rd = fieldFromInstruction(Insn, 0, 5);
@@ -826,66 +816,74 @@ static DecodeStatus DecodeVecShiftLImm(MCInst &Inst, unsigned Imm,
}
static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm, 64);
}
static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
}
static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm, 32);
}
static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
}
static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm, 16);
}
static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
}
static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftRImm(Inst, Imm, 8);
}
static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftLImm(Inst, Imm, 64);
}
static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftLImm(Inst, Imm, 32);
}
static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftLImm(Inst, Imm, 16);
}
static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeVecShiftLImm(Inst, Imm, 8);
}
-static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Rm = fieldFromInstruction(insn, 16, 5);
@@ -947,7 +945,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned imm = fieldFromInstruction(insn, 5, 16);
unsigned shift = fieldFromInstruction(insn, 21, 2);
@@ -978,14 +976,12 @@ static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn,
return Success;
}
-static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned offset = fieldFromInstruction(insn, 10, 12);
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
switch (Inst.getOpcode()) {
default:
@@ -1034,14 +1030,14 @@ static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn,
}
DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
- if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(offset));
return Success;
}
static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
int64_t offset = fieldFromInstruction(insn, 12, 9);
@@ -1237,9 +1233,9 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
return Success;
}
-static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
@@ -1322,7 +1318,7 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
@@ -1456,7 +1452,7 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 |
@@ -1489,7 +1485,7 @@ static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Rm = fieldFromInstruction(insn, 16, 5);
@@ -1546,7 +1542,7 @@ static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Datasize = fieldFromInstruction(insn, 31, 1);
@@ -1577,7 +1573,7 @@ static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned cmode = fieldFromInstruction(insn, 12, 4);
unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
@@ -1616,7 +1612,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned cmode = fieldFromInstruction(insn, 12, 4);
unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
@@ -1633,26 +1629,26 @@ static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
}
static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
imm |= fieldFromInstruction(insn, 29, 2);
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
// Sign-extend the 21-bit immediate.
if (imm & (1 << (21 - 1)))
imm |= ~((1LL << 21) - 1);
DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
- if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(imm));
return Success;
}
static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Imm = fieldFromInstruction(insn, 10, 14);
@@ -1661,8 +1657,6 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
unsigned ShifterVal = (Imm >> 12) & 3;
unsigned ImmVal = Imm & 0xFFF;
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
if (ShifterVal != 0 && ShifterVal != 1)
return Fail;
@@ -1681,7 +1675,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
}
- if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(ImmVal));
Inst.addOperand(MCOperand::createImm(12 * ShifterVal));
return Success;
@@ -1689,24 +1683,22 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
int64_t imm = fieldFromInstruction(insn, 0, 26);
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
// Sign-extend the 26-bit immediate.
if (imm & (1 << (26 - 1)))
imm |= ~((1LL << 26) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(imm));
return Success;
}
-static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+ const MCDisassembler *Decoder) {
uint64_t op1 = fieldFromInstruction(insn, 16, 3);
uint64_t op2 = fieldFromInstruction(insn, 5, 3);
uint64_t crm = fieldFromInstruction(insn, 8, 4);
@@ -1726,22 +1718,20 @@ static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn,
Inst.addOperand(MCOperand::createImm(pstate_field));
Inst.addOperand(MCOperand::createImm(crm));
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
auto PState = AArch64PState::lookupPStateByEncoding(pstate_field);
- if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+ if (PState &&
+ PState->haveFeatures(Decoder->getSubtargetInfo().getFeatureBits()))
return Success;
return Fail;
}
static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
uint64_t Rt = fieldFromInstruction(insn, 0, 5);
uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
bit |= fieldFromInstruction(insn, 19, 5);
int64_t dst = fieldFromInstruction(insn, 5, 14);
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
// Sign-extend 14-bit immediate.
if (dst & (1 << (14 - 1)))
@@ -1752,17 +1742,16 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
else
DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
Inst.addOperand(MCOperand::createImm(bit));
- if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
+ if (!Decoder->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(dst));
return Success;
}
-static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
- unsigned RegClassID,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegClassID,
+ unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder) {
// Register number must be even (see CASP instruction)
if (RegNo & 0x1)
return Fail;
@@ -1772,27 +1761,25 @@ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
return Success;
}
-static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::WSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
-static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+ const MCDisassembler *Decoder) {
return DecodeGPRSeqPairsClassRegisterClass(Inst,
AArch64::XSeqPairsClassRegClassID,
RegNo, Addr, Decoder);
}
-static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus
+DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Zdn = fieldFromInstruction(insn, 0, 5);
unsigned imm = fieldFromInstruction(insn, 5, 13);
if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
@@ -1808,7 +1795,7 @@ static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
template <int Bits>
static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (Imm & ~((1LL << Bits) - 1))
return Fail;
@@ -1822,8 +1809,8 @@ static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
// Decode 8-bit signed/unsigned immediate for a given element width.
template <int ElementWidth>
-static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,
+ const MCDisassembler *Decoder) {
unsigned Val = (uint8_t)Imm;
unsigned Shift = (Imm & 0x100) ? 8 : 0;
if (ElementWidth == 8 && Shift)
@@ -1835,13 +1822,14 @@ static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
// Decode uimm4 ranged from 1-16.
static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
- uint64_t Addr, const void *Decoder) {
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
Inst.addOperand(MCOperand::createImm(Imm + 1));
return Success;
}
static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
if (AArch64SVCR::lookupSVCRByEncoding(Imm)) {
Inst.addOperand(MCOperand::createImm(Imm));
return Success;
@@ -1851,7 +1839,7 @@ static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rs = fieldFromInstruction(insn, 16, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
@@ -1876,7 +1864,7 @@ static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
- const void *Decoder) {
+ const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
unsigned Rm = fieldFromInstruction(insn, 16, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 374a89edcb74..6761d449a7f4 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -13,13 +13,17 @@
#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstrInfo.h"
namespace llvm {
class AArch64Disassembler : public MCDisassembler {
+ std::unique_ptr<const MCInstrInfo> const MCII;
+
public:
- AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
- : MCDisassembler(STI, Ctx) {}
+ AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo const *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII) {}
~AArch64Disassembler() override = default;
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 5b6f06f8dbb4..11964b2075e5 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -60,7 +60,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) {
/// an operand to the MCInst and Fail otherwise.
bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
- bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+ bool IsBranch, uint64_t Offset, uint64_t OpSize, uint64_t InstSize) {
if (!SymbolLookUp)
return false;
// FIXME: This method shares a lot of code with
@@ -73,8 +73,8 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
SymbolicOp.Value = Value;
uint64_t ReferenceType;
const char *ReferenceName;
- if (!GetOpInfo ||
- !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+ if (!GetOpInfo || !GetOpInfo(DisInfo, Address, /*Offset=*/0, OpSize, InstSize,
+ 1, &SymbolicOp)) {
if (IsBranch) {
ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index dc72331660cc..ca677db49739 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -29,7 +29,8 @@ public:
bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
int64_t Value, uint64_t Address, bool IsBranch,
- uint64_t Offset, uint64_t InstSize) override;
+ uint64_t Offset, uint64_t OpSize,
+ uint64_t InstSize) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 097b93e4fcca..89e1d85a6085 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -18,6 +18,7 @@
#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -1058,10 +1059,10 @@ bool AArch64CallLowering::lowerTailCall(
// If Callee is a reg, since it is used by a target specific instruction,
// it must have a register class matching the constraint of that instruction.
- if (Info.Callee.isReg())
+ if (MIB->getOperand(0).isReg())
constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
*MF.getSubtarget().getRegBankInfo(), *MIB,
- MIB->getDesc(), Info.Callee, 0);
+ MIB->getDesc(), MIB->getOperand(0), 0);
MF.getFrameInfo().setHasTailCall();
Info.LoweredTailCall = true;
@@ -1127,14 +1128,39 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
+
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ unsigned Opc = 0;
+ // Calls with operand bundle "clang.arc.attachedcall" are special. They should
+ // be expanded to the call, directly followed by a special marker sequence and
+ // a call to an ObjC library function.
+ if (Info.CB && objcarc::hasAttachedCallOpBundle(Info.CB))
+ Opc = AArch64::BLR_RVMARKER;
+ // A call to a returns twice function like setjmp must be followed by a bti
+ // instruction.
+ else if (Info.CB &&
+ Info.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
+ !Subtarget.noBTIAtReturnTwice() &&
+ MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
+ Opc = AArch64::BLR_BTI;
+ else
+ Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ unsigned CalleeOpNo = 0;
+
+ if (Opc == AArch64::BLR_RVMARKER) {
+ // Add a target global address for the retainRV/claimRV runtime function
+ // just before the call target.
+ Function *ARCFn = *objcarc::getAttachedARCFunction(Info.CB);
+ MIB.addGlobalAddress(ARCFn);
+ ++CalleeOpNo;
+ }
+
MIB.add(Info.Callee);
// Tell the call which registers are clobbered.
const uint32_t *Mask;
- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const auto *TRI = Subtarget.getRegisterInfo();
AArch64OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg,
@@ -1160,10 +1186,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
- if (Info.Callee.isReg())
+ if (MIB->getOperand(CalleeOpNo).isReg())
constrainOperandRegClass(MF, *TRI, MRI, *Subtarget.getInstrInfo(),
*Subtarget.getRegBankInfo(), *MIB, MIB->getDesc(),
- Info.Callee, 0);
+ MIB->getOperand(CalleeOpNo), CalleeOpNo);
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 703e356f016d..9a65687735fe 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -21,13 +21,16 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -38,9 +41,9 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -62,6 +65,7 @@ namespace {
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET
+
class AArch64InstructionSelector : public InstructionSelector {
public:
AArch64InstructionSelector(const AArch64TargetMachine &TM,
@@ -293,6 +297,20 @@ private:
emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
+ /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
+ /// In some cases this is even possible with OR operations in the expression.
+ MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
+ MachineIRBuilder &MIB) const;
+ MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
+ CmpInst::Predicate CC,
+ AArch64CC::CondCode Predicate,
+ AArch64CC::CondCode OutCC,
+ MachineIRBuilder &MIB) const;
+ MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
+ bool Negate, Register CCOp,
+ AArch64CC::CondCode Predicate,
+ MachineIRBuilder &MIB) const;
+
/// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
/// \p IsNegative is true if the test should be "not zero".
/// This will also optimize the test bit instruction when possible.
@@ -419,12 +437,16 @@ private:
int OpIdx = -1) const;
void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;
+ void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx = -1) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
// Optimization methods.
- bool tryOptSelect(MachineInstr &MI);
+ bool tryOptSelect(GSelect &Sel);
+ bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
@@ -485,9 +507,11 @@ AArch64InstructionSelector::AArch64InstructionSelector(
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
+//
+/// Given a register bank, and a type, return the smallest register class that
+/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
- const RegisterBankInfo &RBI,
bool GetAllRegSet = false) {
if (RB.getID() == AArch64::GPRRegBankID) {
if (Ty.getSizeInBits() <= 32)
@@ -828,39 +852,6 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
-#ifndef NDEBUG
-/// Helper function that verifies that we have a valid copy at the end of
-/// selectCopy. Verifies that the source and dest have the expected sizes and
-/// then returns true.
-static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
- const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
- const Register DstReg = I.getOperand(0).getReg();
- const Register SrcReg = I.getOperand(1).getReg();
- const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
-
- // Make sure the size of the source and dest line up.
- assert(
- (DstSize == SrcSize ||
- // Copies are a mean to setup initial types, the number of
- // bits may not exactly match.
- (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
- // Copies are a mean to copy bits around, as long as we are
- // on the same register class, that's fine. Otherwise, that
- // means we need some SUBREG_TO_REG or AND & co.
- (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
- "Copy with different width?!");
-
- // Check the size of the destination.
- assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
- "GPRs cannot get more than 64-bit width values");
-
- return true;
-}
-#endif
-
/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
@@ -935,31 +926,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return false;
}
- // A couple helpers below, for making sure that the copy we produce is valid.
-
- // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
- // to verify that the src and dst are the same size, since that's handled by
- // the SUBREG_TO_REG.
- bool KnownValid = false;
-
- // Returns true, or asserts if something we don't expect happens. Instead of
- // returning true, we return isValidCopy() to ensure that we verify the
- // result.
- auto CheckCopy = [&]() {
- // If we have a bitcast or something, we can't have physical registers.
- assert((I.isCopy() ||
- (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
- !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
- "No phys reg on generic operator!");
- bool ValidCopy = true;
-#ifndef NDEBUG
- ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
- assert(ValidCopy && "Invalid copy.");
-#endif
- (void)KnownValid;
- return ValidCopy;
- };
-
// Is this a copy? If so, then we may need to insert a subregister copy.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
@@ -1004,15 +970,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
.addImm(SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(PromoteReg);
-
- // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
- KnownValid = true;
}
// If the destination is a physical register, then there's nothing to
// change, so we're done.
if (Register::isPhysicalRegister(DstReg))
- return CheckCopy();
+ return true;
}
// No need to constrain SrcReg. It will get constrained when we hit another
@@ -1032,7 +995,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
}
I.setDesc(TII.get(AArch64::COPY));
- return CheckCopy();
+ return true;
}
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
@@ -1309,6 +1272,90 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
}
}
+/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
+static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown FP condition!");
+ case CmpInst::FCMP_OEQ:
+ CondCode = AArch64CC::EQ;
+ break;
+ case CmpInst::FCMP_OGT:
+ CondCode = AArch64CC::GT;
+ break;
+ case CmpInst::FCMP_OGE:
+ CondCode = AArch64CC::GE;
+ break;
+ case CmpInst::FCMP_OLT:
+ CondCode = AArch64CC::MI;
+ break;
+ case CmpInst::FCMP_OLE:
+ CondCode = AArch64CC::LS;
+ break;
+ case CmpInst::FCMP_ONE:
+ CondCode = AArch64CC::MI;
+ CondCode2 = AArch64CC::GT;
+ break;
+ case CmpInst::FCMP_ORD:
+ CondCode = AArch64CC::VC;
+ break;
+ case CmpInst::FCMP_UNO:
+ CondCode = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_UEQ:
+ CondCode = AArch64CC::EQ;
+ CondCode2 = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_UGT:
+ CondCode = AArch64CC::HI;
+ break;
+ case CmpInst::FCMP_UGE:
+ CondCode = AArch64CC::PL;
+ break;
+ case CmpInst::FCMP_ULT:
+ CondCode = AArch64CC::LT;
+ break;
+ case CmpInst::FCMP_ULE:
+ CondCode = AArch64CC::LE;
+ break;
+ case CmpInst::FCMP_UNE:
+ CondCode = AArch64CC::NE;
+ break;
+ }
+}
+
+/// Convert an IR fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
+ AArch64CC::CondCode &CondCode,
+ AArch64CC::CondCode &CondCode2) {
+ CondCode2 = AArch64CC::AL;
+ switch (CC) {
+ default:
+ changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
+ assert(CondCode2 == AArch64CC::AL);
+ break;
+ case CmpInst::FCMP_ONE:
+ // (a one b)
+ // == ((a olt b) || (a ogt b))
+ // == ((a ord b) && (a une b))
+ CondCode = AArch64CC::VC;
+ CondCode2 = AArch64CC::NE;
+ break;
+ case CmpInst::FCMP_UEQ:
+ // (a ueq b)
+ // == ((a uno b) || (a oeq b))
+ // == ((a ule b) && (a uge b))
+ CondCode = AArch64CC::PL;
+ CondCode2 = AArch64CC::LE;
+ break;
+ }
+}
+
/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
MachineRegisterInfo &MRI) {
@@ -1703,7 +1750,6 @@ static Optional<int64_t> getVectorShiftImm(Register Reg,
MachineRegisterInfo &MRI) {
assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
MachineInstr *OpMI = MRI.getVRegDef(Reg);
- assert(OpMI && "Expected to find a vreg def for vector shift operand");
return getAArch64VectorSplatScalar(*OpMI, MRI);
}
@@ -1810,7 +1856,7 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
unsigned Opc = 0;
unsigned NegOpc = 0;
const TargetRegisterClass *RC =
- getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
+ getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
if (Ty == LLT::fixed_vector(2, 64)) {
Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
NegOpc = AArch64::NEGv2i64;
@@ -2266,6 +2312,16 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
I.eraseFromParent();
return true;
}
+ case TargetOpcode::G_FENCE: {
+ if (I.getOperand(1).getImm() == 0)
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CompilerBarrier))
+ .addImm(I.getOperand(0).getImm());
+ else
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::DMB))
+ .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
+ I.eraseFromParent();
+ return true;
+ }
default:
return false;
}
@@ -2279,8 +2335,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const AArch64Subtarget *Subtarget =
- &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+ const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
if (Subtarget->requiresStrictAlign()) {
// We don't support this feature yet.
LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
@@ -2312,7 +2367,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return false;
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
- DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
+ DefRC = getRegClassForTypeOnBank(DefTy, RB);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
@@ -2488,7 +2543,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// The case when we have 0.0 is covered by tablegen. Reject it here so we
// can be sure tablegen works correctly and isn't rescued by this code.
- // 0.0 is not covered by tablegen for FP128. So we will handle this
+ // 0.0 is not covered by tablegen for FP128. So we will handle this
// scenario in the code here.
if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
return false;
@@ -2510,7 +2565,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
if (isFP) {
- const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize);
+ const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
// For 16, 64, and 128b values, emit a constant pool load.
switch (DefSize) {
default:
@@ -2735,12 +2790,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return false;
if (isa<GLoad>(LdSt)) {
- static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
- AArch64::LDARW, AArch64::LDARX};
+ static constexpr unsigned LDAPROpcodes[] = {
+ AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
+ static constexpr unsigned LDAROpcodes[] = {
+ AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
+ ArrayRef<unsigned> Opcodes =
+ STI.hasLDAPR() && Order != AtomicOrdering::SequentiallyConsistent
+ ? LDAPROpcodes
+ : LDAROpcodes;
I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
} else {
- static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
- AArch64::STLRW, AArch64::STLRX};
+ static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
+ AArch64::STLRW, AArch64::STLRX};
Register ValReg = LdSt.getReg(0);
if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
// Emit a subreg copy of 32 bits.
@@ -2774,7 +2835,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
unsigned SubReg;
LLT MemTy = LdSt.getMMO().getMemoryType();
- auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+ auto *RC = getRegClassForTypeOnBank(MemTy, RB);
if (!getSubRegForClass(RC, TRI, SubReg))
return false;
@@ -2790,7 +2851,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (RB.getID() == AArch64::FPRRegBankID) {
unsigned SubReg;
LLT MemTy = LdSt.getMMO().getMemoryType();
- auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+ auto *RC = getRegClassForTypeOnBank(MemTy, RB);
if (!getSubRegForClass(RC, TRI, SubReg))
return false;
Register OldDst = LdSt.getReg(0);
@@ -2804,7 +2865,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
.addImm(0)
.addUse(NewDst)
.addImm(SubReg);
- auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI);
+ auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
MIB.setInstr(LdSt);
}
@@ -2934,8 +2995,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
ShiftTy.getSizeInBits() == 64) {
assert(!ShiftTy.isVector() && "unexpected vector shift ty");
- assert(MRI.getVRegDef(ShiftReg) &&
- "could not find a vreg definition for shift amount");
// Insert a subregister copy to implement a 64->32 trunc
auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
.addReg(ShiftReg, 0, AArch64::sub_32);
@@ -2944,10 +3003,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
}
LLVM_FALLTHROUGH;
- case TargetOpcode::G_FADD:
- case TargetOpcode::G_FSUB:
- case TargetOpcode::G_FMUL:
- case TargetOpcode::G_FDIV:
case TargetOpcode::G_OR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -3026,13 +3081,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
if (DstRB.getID() == AArch64::GPRRegBankID) {
- const TargetRegisterClass *DstRC =
- getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+ const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
if (!DstRC)
return false;
- const TargetRegisterClass *SrcRC =
- getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
+ const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
if (!SrcRC)
return false;
@@ -3270,6 +3323,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.setDesc(TII.get(NewOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ I.setFlags(MachineInstr::NoFPExcept);
return true;
}
@@ -3291,17 +3345,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_SELECT: {
- if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
+ auto &Sel = cast<GSelect>(I);
+ if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) {
LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
<< ", expected: " << LLT::scalar(1) << '\n');
return false;
}
- const Register CondReg = I.getOperand(1).getReg();
- const Register TReg = I.getOperand(2).getReg();
- const Register FReg = I.getOperand(3).getReg();
+ const Register CondReg = Sel.getCondReg();
+ const Register TReg = Sel.getTrueReg();
+ const Register FReg = Sel.getFalseReg();
- if (tryOptSelect(I))
+ if (tryOptSelect(Sel))
return true;
// Make sure to use an unused vreg instead of wzr, so that the peephole
@@ -3310,9 +3365,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
- if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
+ if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
return false;
- I.eraseFromParent();
+ Sel.eraseFromParent();
return true;
}
case TargetOpcode::G_ICMP: {
@@ -3357,8 +3412,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const Register DstReg = I.getOperand(0).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
- const TargetRegisterClass *DstRC =
- getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+ const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
return true;
}
@@ -3871,7 +3925,7 @@ bool AArch64InstructionSelector::selectVectorICmp(
const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
const TargetRegisterClass *SrcRC =
- getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
+ getRegClassForTypeOnBank(SrcTy, VecRB, true);
if (!SrcRC) {
LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
return false;
@@ -4037,7 +4091,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
}
const TargetRegisterClass *DstRC =
- getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
+ getRegClassForTypeOnBank(ScalarTy, DstRB, true);
if (!DstRC) {
LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
return nullptr;
@@ -4046,7 +4100,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
const LLT &VecTy = MRI.getType(VecReg);
const TargetRegisterClass *VecRC =
- getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
+ getRegClassForTypeOnBank(VecTy, VecRB, true);
if (!VecRC) {
LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
return nullptr;
@@ -4205,9 +4259,9 @@ bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
} else {
// No. We have to perform subregister inserts. For each insert, create an
// implicit def and a subregister insert, and save the register we create.
- const TargetRegisterClass *RC =
- getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
- WideTy.getScalarSizeInBits() * NumElts);
+ const TargetRegisterClass *RC = getRegClassForTypeOnBank(
+ LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
+ *RBI.getRegBank(SrcReg, MRI, TRI));
unsigned SubReg = 0;
bool Found = getSubRegForClass(RC, TRI, SubReg);
(void)Found;
@@ -4594,6 +4648,7 @@ AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
// Partially build the compare. Decide if we need to add a use for the
// third operand based off whether or not we're comparing against 0.0.
auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
+ CmpMI.setMIFlags(MachineInstr::NoFPExcept);
if (!ShouldUseImm)
CmpMI.addUse(RHS);
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
@@ -4632,7 +4687,7 @@ MachineInstr *AArch64InstructionSelector::emitVectorConcat(
const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
const TargetRegisterClass *DstRC =
- getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+ getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
MachineInstr *WidenedOp1 =
emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
@@ -4701,7 +4756,256 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
}
}
-bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
+/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
+/// expressed as a conjunction.
+/// \param CanNegate Set to true if we can negate the whole sub-tree just by
+/// changing the conditions on the CMP tests.
+/// (this means we can call emitConjunctionRec() with
+/// Negate==true on this sub-tree)
+/// \param MustBeFirst Set to true if this subtree needs to be negated and we
+/// cannot do the negation naturally. We are required to
+/// emit the subtree first in this case.
+/// \param WillNegate Is true if are called when the result of this
+/// subexpression must be negated. This happens when the
+/// outer expression is an OR. We can use this fact to know
+/// that we have a double negation (or (or ...) ...) that
+/// can be implemented for free.
+static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
+ bool WillNegate, MachineRegisterInfo &MRI,
+ unsigned Depth = 0) {
+ if (!MRI.hasOneNonDBGUse(Val))
+ return false;
+ MachineInstr *ValDef = MRI.getVRegDef(Val);
+ unsigned Opcode = ValDef->getOpcode();
+ if (Opcode == TargetOpcode::G_TRUNC) {
+ // Look through a trunc.
+ Val = ValDef->getOperand(1).getReg();
+ ValDef = MRI.getVRegDef(Val);
+ Opcode = ValDef->getOpcode();
+ }
+ if (isa<GAnyCmp>(ValDef)) {
+ CanNegate = true;
+ MustBeFirst = false;
+ return true;
+ }
+ // Protect against exponential runtime and stack overflow.
+ if (Depth > 6)
+ return false;
+ if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
+ bool IsOR = Opcode == TargetOpcode::G_OR;
+ Register O0 = ValDef->getOperand(1).getReg();
+ Register O1 = ValDef->getOperand(2).getReg();
+ bool CanNegateL;
+ bool MustBeFirstL;
+ if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
+ return false;
+ bool CanNegateR;
+ bool MustBeFirstR;
+ if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
+ return false;
+
+ if (MustBeFirstL && MustBeFirstR)
+ return false;
+
+ if (IsOR) {
+ // For an OR expression we need to be able to naturally negate at least
+ // one side or we cannot do the transformation at all.
+ if (!CanNegateL && !CanNegateR)
+ return false;
+ // If we the result of the OR will be negated and we can naturally negate
+ // the leaves, then this sub-tree as a whole negates naturally.
+ CanNegate = WillNegate && CanNegateL && CanNegateR;
+ // If we cannot naturally negate the whole sub-tree, then this must be
+ // emitted first.
+ MustBeFirst = !CanNegate;
+ } else {
+ assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
+ // We cannot naturally negate an AND operation.
+ CanNegate = false;
+ MustBeFirst = MustBeFirstL || MustBeFirstR;
+ }
+ return true;
+ }
+ return false;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
+ Register LHS, Register RHS, CmpInst::Predicate CC,
+ AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
+ MachineIRBuilder &MIB) const {
+ // TODO: emit CMN as an optimization.
+ auto &MRI = *MIB.getMRI();
+ LLT OpTy = MRI.getType(LHS);
+ assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
+ unsigned CCmpOpc;
+ if (CmpInst::isIntPredicate(CC)) {
+ CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
+ } else {
+ switch (OpTy.getSizeInBits()) {
+ case 16:
+ CCmpOpc = AArch64::FCCMPHrr;
+ break;
+ case 32:
+ CCmpOpc = AArch64::FCCMPSrr;
+ break;
+ case 64:
+ CCmpOpc = AArch64::FCCMPDrr;
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+ auto CCmp =
+ MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate);
+ constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
+ return &*CCmp;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
+ Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
+ AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
+ // We're at a tree leaf, produce a conditional comparison operation.
+ auto &MRI = *MIB.getMRI();
+ MachineInstr *ValDef = MRI.getVRegDef(Val);
+ unsigned Opcode = ValDef->getOpcode();
+ if (Opcode == TargetOpcode::G_TRUNC) {
+ // Look through a trunc.
+ Val = ValDef->getOperand(1).getReg();
+ ValDef = MRI.getVRegDef(Val);
+ Opcode = ValDef->getOpcode();
+ }
+ if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
+ Register LHS = Cmp->getLHSReg();
+ Register RHS = Cmp->getRHSReg();
+ CmpInst::Predicate CC = Cmp->getCond();
+ if (Negate)
+ CC = CmpInst::getInversePredicate(CC);
+ if (isa<GICmp>(Cmp)) {
+ OutCC = changeICMPPredToAArch64CC(CC);
+ } else {
+ // Handle special FP cases.
+ AArch64CC::CondCode ExtraCC;
+ changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+ // Some floating point conditions can't be tested with a single condition
+ // code. Construct an additional comparison in this case.
+ if (ExtraCC != AArch64CC::AL) {
+ MachineInstr *ExtraCmp;
+ if (!CCOp)
+ ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
+ else
+ ExtraCmp =
+ emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
+ CCOp = ExtraCmp->getOperand(0).getReg();
+ Predicate = ExtraCC;
+ }
+ }
+
+ // Produce a normal comparison if we are first in the chain
+ if (!CCOp) {
+ auto Dst = MRI.cloneVirtualRegister(LHS);
+ if (isa<GICmp>(Cmp))
+ return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
+ return emitFPCompare(Cmp->getOperand(2).getReg(),
+ Cmp->getOperand(3).getReg(), MIB);
+ }
+ // Otherwise produce a ccmp.
+ return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
+ }
+ assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
+
+ bool IsOR = Opcode == TargetOpcode::G_OR;
+
+ Register LHS = ValDef->getOperand(1).getReg();
+ bool CanNegateL;
+ bool MustBeFirstL;
+ bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
+ assert(ValidL && "Valid conjunction/disjunction tree");
+ (void)ValidL;
+
+ Register RHS = ValDef->getOperand(2).getReg();
+ bool CanNegateR;
+ bool MustBeFirstR;
+ bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
+ assert(ValidR && "Valid conjunction/disjunction tree");
+ (void)ValidR;
+
+ // Swap sub-tree that must come first to the right side.
+ if (MustBeFirstL) {
+ assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
+ std::swap(LHS, RHS);
+ std::swap(CanNegateL, CanNegateR);
+ std::swap(MustBeFirstL, MustBeFirstR);
+ }
+
+ bool NegateR;
+ bool NegateAfterR;
+ bool NegateL;
+ bool NegateAfterAll;
+ if (Opcode == TargetOpcode::G_OR) {
+ // Swap the sub-tree that we can negate naturally to the left.
+ if (!CanNegateL) {
+ assert(CanNegateR && "at least one side must be negatable");
+ assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
+ assert(!Negate);
+ std::swap(LHS, RHS);
+ NegateR = false;
+ NegateAfterR = true;
+ } else {
+ // Negate the left sub-tree if possible, otherwise negate the result.
+ NegateR = CanNegateR;
+ NegateAfterR = !CanNegateR;
+ }
+ NegateL = true;
+ NegateAfterAll = !Negate;
+ } else {
+ assert(Opcode == TargetOpcode::G_AND &&
+ "Valid conjunction/disjunction tree");
+ assert(!Negate && "Valid conjunction/disjunction tree");
+
+ NegateL = false;
+ NegateR = false;
+ NegateAfterR = false;
+ NegateAfterAll = false;
+ }
+
+ // Emit sub-trees.
+ AArch64CC::CondCode RHSCC;
+ MachineInstr *CmpR =
+ emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
+ if (NegateAfterR)
+ RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+ MachineInstr *CmpL = emitConjunctionRec(
+ LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
+ if (NegateAfterAll)
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ return CmpL;
+}
+
+MachineInstr *AArch64InstructionSelector::emitConjunction(
+ Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
+ bool DummyCanNegate;
+ bool DummyMustBeFirst;
+ if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
+ *MIB.getMRI()))
+ return nullptr;
+ return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
+}
+
+bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
+ MachineInstr &CondMI) {
+ AArch64CC::CondCode AArch64CC;
+ MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
+ if (!ConjMI)
+ return false;
+
+ emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
+ SelI.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
MachineRegisterInfo &MRI = *MIB.getMRI();
// We want to recognize this pattern:
//
@@ -4750,12 +5054,12 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
}
// Is the condition defined by a compare?
- if (!CondDef)
- return false;
-
unsigned CondOpc = CondDef->getOpcode();
- if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
+ if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
+ if (tryOptSelectConjunction(I, *CondDef))
+ return true;
return false;
+ }
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
@@ -5081,7 +5385,7 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
// the original size to get the result we want.
Register DemoteVec = InsMI->getOperand(0).getReg();
const TargetRegisterClass *RC =
- getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
+ getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
return false;
@@ -5198,12 +5502,11 @@ bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
}))
return false;
unsigned SubReg;
- const TargetRegisterClass *EltRC =
- getMinClassForRegBank(EltRB, EltTy.getSizeInBits());
+ const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
if (!EltRC)
return false;
const TargetRegisterClass *DstRC =
- getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits());
+ getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
if (!DstRC)
return false;
if (!getSubRegForClass(EltRC, TRI, SubReg))
@@ -5261,7 +5564,7 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
if (DstSize < 128) {
// Force this to be FPR using the destination vector.
const TargetRegisterClass *RC =
- getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
+ getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
if (!RC)
return false;
if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
@@ -5528,7 +5831,7 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
uint64_t Key = I.getOperand(3).getImm();
Register DiscReg = I.getOperand(4).getReg();
auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
- bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue();
+ bool IsDiscZero = DiscVal && DiscVal->isNullValue();
if (Key > 3)
return false;
@@ -5777,8 +6080,6 @@ AArch64InstructionSelector::selectExtendedSHL(
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
- if (!OffsetInst)
- return None;
unsigned OffsetOpc = OffsetInst->getOpcode();
bool LookedThroughZExt = false;
@@ -5932,7 +6233,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
// We need a GEP.
MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
- if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
+ if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
return None;
// If this is used more than once, let's not bother folding.
@@ -6112,14 +6413,12 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
return None;
MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
- if (!RootDef)
- return None;
MachineOperand &OffImm = RootDef->getOperand(2);
if (!OffImm.isReg())
return None;
MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
- if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
+ if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
return None;
int64_t RHSC;
MachineOperand &RHSOp1 = RHS->getOperand(1);
@@ -6187,9 +6486,6 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
return None;
MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
- if (!RootDef)
- return None;
-
if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
@@ -6210,27 +6506,26 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
MachineOperand &RHS = RootDef->getOperand(2);
MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
- unsigned Scale = Log2_32(Size);
- if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
- if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
- }};
+ int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+ if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+ [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
}};
- }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
+ }};
}
}
// Before falling back to our general case, check if the unscaled
// instructions can handle this. If so, that's preferable.
- if (selectAddrModeUnscaled(Root, Size).hasValue())
+ if (selectAddrModeUnscaled(Root, Size))
return None;
return {{
@@ -6269,8 +6564,6 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
// Check if the operand is defined by an instruction which corresponds to
// a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
- if (!ShiftInst)
- return None;
AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
if (ShType == AArch64_AM::InvalidShiftExtend)
return None;
@@ -6425,7 +6718,7 @@ AArch64InstructionSelector::selectArithExtendedRegister(
// to.
if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
- if (ExtInst && isDef32(*ExtInst))
+ if (isDef32(*ExtInst))
return None;
}
}
@@ -6450,7 +6743,7 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
Optional<int64_t> CstVal =
getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
assert(CstVal && "Expected constant value");
- MIB.addImm(CstVal.getValue());
+ MIB.addImm(*CstVal);
}
void AArch64InstructionSelector::renderLogicalImm32(
@@ -6498,6 +6791,17 @@ void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}
+void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
+ MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
+ "Expected G_FCONSTANT");
+ MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
+ .getFPImm()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue()));
+}
+
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
const MachineInstr &MI, unsigned NumBytes) const {
if (!MI.mayLoadOrStore())
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e9df7e001d38..74ec9373ce9e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -169,7 +169,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarize(0);
getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
- .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
+ .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
.widenScalarOrEltToNextPow2(0)
.clampScalarOrElt(0, s32, s64)
.clampNumElements(0, v2s32, v4s32)
@@ -180,7 +180,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
.widenScalarToNextPow2(0, /*Min = */ 32)
.clampScalar(0, s32, s64)
- .lowerIf(typeIs(1, s1));
+ .lower();
getActionDefinitionsBuilder({G_SMULH, G_UMULH})
.legalFor({s64, v8s16, v16s8, v4s32})
@@ -308,7 +308,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// These extends are also legal
.legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
.widenScalarToNextPow2(0, /* MinSize = */8)
- .lowerIfMemSizeNotPow2()
+ .lowerIfMemSizeNotByteSizePow2()
.clampScalar(0, s8, s64)
.narrowScalarIf([=](const LegalityQuery &Query) {
// Clamp extending load results to 32-bits.
@@ -317,10 +317,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
Query.Types[0].getSizeInBits() > 32;
},
changeTo(0, s32))
- // Lower any any-extending loads left into G_ANYEXT and G_LOAD
- .lowerIf([=](const LegalityQuery &Query) {
- return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
- })
.clampMaxNumElements(0, s8, 16)
.clampMaxNumElements(0, s16, 8)
.clampMaxNumElements(0, s32, 4)
@@ -536,7 +532,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
.lowerIf(
- all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1), typeIs(2, p0)));
+ all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
.customIf([](const LegalityQuery &Query) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 3dec980a819a..ba206bac68d1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -20,11 +20,13 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -133,7 +135,7 @@ bool matchAArch64MulConstCombine(
if (!Const)
return false;
- const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits());
+ APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
// The following code is ported from AArch64ISelLowering.
// Multiplication of a power of two plus/minus one can be done more
// cheaply as as shift+add/sub. For now, this is true unilaterally. If
@@ -258,7 +260,7 @@ void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
// %d(s64) = G_ZEXT %a(s32)
Observer.changingInstr(MI);
MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
- MI.RemoveOperand(2);
+ MI.removeOperand(2);
Observer.changedInstr(MI);
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 3ff67d188822..d7959a82c484 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -58,7 +58,7 @@ struct ShuffleVectorPseudo {
ShuffleVectorPseudo(unsigned Opc, Register Dst,
std::initializer_list<SrcOp> SrcOps)
: Opc(Opc), Dst(Dst), SrcOps(SrcOps){};
- ShuffleVectorPseudo() {}
+ ShuffleVectorPseudo() = default;
};
/// Check if a vector shuffle corresponds to a REV instruction with the
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
index cc45c6642ac5..ce6f15a799b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
@@ -149,7 +149,7 @@ bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
"op in fcmp range: "
<< II);
II.setDesc(TII->get(NewOpc));
- II.RemoveOperand(DeadNZCVIdx);
+ II.removeOperand(DeadNZCVIdx);
// Changing the opcode can result in differing regclass requirements,
// e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
// Constrain the regclasses, possibly introducing a copy.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index d3f4130d2ba1..275949c5ee64 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -13,6 +13,7 @@
#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -162,13 +163,14 @@ static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
// Check whether folding this offset is legal. It must not go out of bounds of
// the referenced object to avoid violating the code model, and must be
- // smaller than 2^21 because this is the largest offset expressible in all
- // object formats.
+ // smaller than 2^20 because this is the largest offset expressible in all
+ // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
+ // stores an immediate signed 21 bit offset.)
//
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
- if (NewOffset >= (1 << 21))
+ if (NewOffset >= (1 << 20))
return false;
Type *T = GV->getValueType();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 515a5c63a559..f0b311289c41 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -12,20 +12,19 @@
//===----------------------------------------------------------------------===//
#include "AArch64RegisterBankInfo.h"
-#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -42,8 +41,8 @@
using namespace llvm;
-AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
- : AArch64GenRegisterBankInfo() {
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(
+ const TargetRegisterInfo &TRI) {
static llvm::once_flag InitializeRegisterBankFlag;
static auto InitializeRegisterBankOnce = [&]() {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 2d76e48d7df2..01ef0bd92d50 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#define GET_REGBANK_DECLARATIONS
#include "AArch64GenRegisterBank.inc"
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index dbb8e85713cb..e4b547e17f64 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -22,10 +22,10 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -470,7 +470,7 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
// We are properly aligned, so write NOPs as requested.
Count /= 4;
for (uint64_t i = 0; i != Count; ++i)
- support::endian::write<uint32_t>(OS, 0xd503201f, Endian);
+ OS.write("\x1f\x20\x03\xd5", 4);
return true;
}
@@ -592,17 +592,18 @@ public:
if (XReg != AArch64::FP)
return CU::UNWIND_ARM64_MODE_DWARF;
- assert(XReg == AArch64::FP && "Invalid frame pointer!");
- assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+ if (i + 2 >= e)
+ return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &LRPush = Instrs[++i];
- assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
- "Link register not pushed!");
+ if (LRPush.getOperation() != MCCFIInstruction::OpOffset)
+ return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &FPPush = Instrs[++i];
- assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
- "Frame pointer not pushed!");
+ if (FPPush.getOperation() != MCCFIInstruction::OpOffset)
+ return CU::UNWIND_ARM64_MODE_DWARF;
- assert(FPPush.getOffset() + 8 == LRPush.getOffset());
+ if (FPPush.getOffset() + 8 != LRPush.getOffset())
+ return CU::UNWIND_ARM64_MODE_DWARF;
CurOffset = FPPush.getOffset();
unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true);
@@ -611,8 +612,8 @@ public:
LRReg = getXRegFromWReg(LRReg);
FPReg = getXRegFromWReg(FPReg);
- assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
- "Pushing invalid registers for frame!");
+ if (LRReg != AArch64::LR || FPReg != AArch64::FP)
+ return CU::UNWIND_ARM64_MODE_DWARF;
// Indicate that the function has a frame.
CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
@@ -620,7 +621,8 @@ public:
break;
}
case MCCFIInstruction::OpDefCfaOffset: {
- assert(StackSize == 0 && "We already have the CFA offset!");
+ if (StackSize != 0)
+ return CU::UNWIND_ARM64_MODE_DWARF;
StackSize = std::abs(Inst.getOffset());
break;
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 78c0e90b1384..46edb12959d2 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -254,6 +254,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
}
void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) {
+ getStreamer().getAssembler().registerSymbol(*Symbol);
cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index ee0870d9ef7a..5d2ba7ef02c0 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1340,11 +1340,6 @@ void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
O << getRegisterName(Even) << ", " << getRegisterName(Odd);
}
-static const unsigned MatrixZADRegisterTable[] = {
- AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3,
- AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7
-};
-
void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1362,7 +1357,7 @@ void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum,
unsigned Reg = RegMask & (1 << I);
if (Reg == 0)
continue;
- O << getRegisterName(MatrixZADRegisterTable[I]);
+ O << getRegisterName(AArch64::ZAD0 + I);
if (Printed + 1 != NumRegs)
O << ", ";
++Printed;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index ad97071434df..2901e5c0fe4d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -16,6 +16,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
@@ -677,7 +678,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
#include "AArch64GenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx) {
return new AArch64MCCodeEmitter(MCII, Ctx);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 844bd6bbada9..cb39c2a11487 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index c1186ae804d2..34e3b2cf58e4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -52,21 +52,14 @@ static MCSubtargetInfo *
createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
if (CPU.empty()) {
CPU = "generic";
+ if (FS.empty())
+ FS = "+v8a";
if (TT.isArm64e())
CPU = "apple-a12";
}
- // Most of the NEON instruction set isn't supported in streaming mode on SME
- // targets, disable NEON unless explicitly requested.
- bool RequestedNEON = FS.contains("neon");
- bool RequestedStreamingSVE = FS.contains("streaming-sve");
- MCSubtargetInfo *STI =
- createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
- if (RequestedStreamingSVE && !RequestedNEON &&
- STI->hasFeature(AArch64::FeatureNEON))
- STI->ToggleFeature(AArch64::FeatureNEON);
- return STI;
+ return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
@@ -243,6 +236,31 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg));
}
+bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) {
+ const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID];
+ return llvm::any_of(MI, [&](const MCOperand &Op) {
+ return Op.isReg() && FPR128.contains(Op.getReg());
+ });
+}
+
+bool AArch64_MC::isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII) {
+ const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID];
+ const auto &FPR64 = AArch64MCRegisterClasses[AArch64::FPR64RegClassID];
+ const auto &FPR32 = AArch64MCRegisterClasses[AArch64::FPR32RegClassID];
+ const auto &FPR16 = AArch64MCRegisterClasses[AArch64::FPR16RegClassID];
+ const auto &FPR8 = AArch64MCRegisterClasses[AArch64::FPR8RegClassID];
+
+ auto IsFPR = [&](const MCOperand &Op) {
+ if (!Op.isReg())
+ return false;
+ auto Reg = Op.getReg();
+ return FPR128.contains(Reg) || FPR64.contains(Reg) || FPR32.contains(Reg) ||
+ FPR16.contains(Reg) || FPR8.contains(Reg);
+ };
+
+ return llvm::any_of(MI, IsFPR);
+}
+
static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAArch64MCRegisterInfo(X, AArch64::LR);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 66cb7a37a958..049c49796dc6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/DataTypes.h"
#include <memory>
@@ -22,6 +23,7 @@ class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
+class MCInst;
class MCInstrInfo;
class MCInstPrinter;
class MCRegisterInfo;
@@ -33,7 +35,6 @@ class MCTargetStreamer;
class Target;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx);
MCAsmBackend *createAArch64leAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
@@ -60,8 +61,16 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
namespace AArch64_MC {
void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
+bool isQForm(const MCInst &MI, const MCInstrInfo *MCII);
+bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII);
}
+namespace AArch64 {
+enum OperandType {
+ OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET,
+};
+} // namespace AArch64
+
} // End llvm namespace
// Defines symbolic names for AArch64 registers. This defines a mapping from
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 92552c3d41d5..1a8071ac1b33 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -76,7 +76,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
return;
}
MCSection *Cur = OutStreamer.getCurrentSectionOnly();
- OutStreamer.SwitchSection(Nt);
+ OutStreamer.switchSection(Nt);
// Emit the note header.
OutStreamer.emitValueToAlignment(Align(8).value());
@@ -92,7 +92,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
OutStreamer.emitIntValue(0, 4); // pad
OutStreamer.endSection(Nt);
- OutStreamer.SwitchSection(Cur);
+ OutStreamer.switchSection(Cur);
}
void AArch64TargetStreamer::emitInst(uint32_t Inst) {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 0072af4cc16e..46ffa50b3e6e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index b688165d3a7b..820d940c1ed2 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -8,6 +8,7 @@
#include "AArch64WinCOFFStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWin64EH.h"
@@ -26,14 +27,14 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitWinEHHandlerData(SMLoc Loc) override;
- void EmitWindowsUnwindTables() override;
- void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
+ void emitWinEHHandlerData(SMLoc Loc) override;
+ void emitWindowsUnwindTables() override;
+ void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
void finishImpl() override;
};
-void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
- MCStreamer::EmitWinEHHandlerData(Loc);
+void AArch64WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) {
+ MCStreamer::emitWinEHHandlerData(Loc);
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section!
@@ -41,11 +42,11 @@ void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
/* HandlerData = */ true);
}
-void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+void AArch64WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
}
-void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
+void AArch64WinCOFFStreamer::emitWindowsUnwindTables() {
if (!getNumWinFrameInfos())
return;
EHStreamer.Emit(*this);
@@ -53,7 +54,7 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
void AArch64WinCOFFStreamer::finishImpl() {
emitFrames(nullptr);
- EmitWindowsUnwindTables();
+ emitWindowsUnwindTables();
MCWinCOFFStreamer::finishImpl();
}
@@ -71,10 +72,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinUnwindCode(unsigned UnwindCode,
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
return;
- MCSymbol *Label = S.emitCFILabel();
- auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset);
+ auto Inst = WinEH::Instruction(UnwindCode, /*Label=*/nullptr, Reg, Offset);
if (InEpilogCFI)
- CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+ CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst);
else
CurFrame->Instructions.push_back(Inst);
}
@@ -176,7 +176,8 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIPrologEnd() {
MCSymbol *Label = S.emitCFILabel();
CurFrame->PrologEnd = Label;
- WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
+ WinEH::Instruction Inst =
+ WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0);
auto it = CurFrame->Instructions.begin();
CurFrame->Instructions.insert(it, Inst);
}
@@ -198,9 +199,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() {
return;
InEpilogCFI = false;
- MCSymbol *Label = S.emitCFILabel();
- WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
- CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+ WinEH::Instruction Inst =
+ WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0);
+ CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst);
CurrentEpilog = nullptr;
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 41f2cead4cf8..2744e81f99f1 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -10,14 +10,36 @@
//
//===----------------------------------------------------------------------===//
+def imm_to_tile8 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAB0>", []>;
+def imm_to_tile16 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAH0>", []>;
+def imm_to_tile32 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAS0>", []>;
+def imm_to_tile64 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>;
+def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>;
+
+def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<4>", []>;
+def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<3>", []>;
+def tileslice32 : ComplexPattern<i32 , 2, "SelectSMETileSlice<2>", []>;
+def tileslice64 : ComplexPattern<i32 , 2, "SelectSMETileSlice<1>", []>;
+def tileslice128 : ComplexPattern<i32 , 2, "SelectSMETileSlice<0>", []>; // nop
+
+def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>;
+
//===----------------------------------------------------------------------===//
// SME Outer Products
//===----------------------------------------------------------------------===//
+class sme_outer_product_pseudo<ZPRRegOp zpr_ty>
+ : Pseudo<(outs), (ins i64imm:$tile, PPR3bAny:$pn, PPR3bAny:$pm,
+ zpr_ty:$zn, zpr_ty:$zm), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+}
+
class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
ZPRRegOp zpr_ty, string mnemonic>
: I<(outs za_ty:$ZAda),
- (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
+ (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
"", []>,
Sched<[]> {
@@ -34,26 +56,42 @@ class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty,
let Inst{9-5} = Zn;
let Inst{4} = S;
let Inst{3} = 0b0;
+
+ let Constraints = "$ZAda = $_ZAda";
}
-class sme_outer_product_fp32<bit S, string mnemonic>
- : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
- bits<2> ZAda;
- let Inst{1-0} = ZAda;
- let Inst{2} = 0b0;
+multiclass sme_outer_product_fp32<bit S, string mnemonic, SDPatternOperator op> {
+ def NAME : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> {
+ bits<2> ZAda;
+ let Inst{1-0} = ZAda;
+ let Inst{2} = 0b0;
+ }
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR32>;
+
+ def : Pat<(op imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
+ (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
}
-class sme_outer_product_fp64<bit S, string mnemonic>
- : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
- bits<3> ZAda;
- let Inst{2-0} = ZAda;
+multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> {
+ def NAME : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> {
+ bits<3> ZAda;
+ let Inst{2-0} = ZAda;
+ }
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR64>;
+
+ def : Pat<(op imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
+ (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
}
class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
MatrixTileOperand za_ty, ZPRRegOp zpr_ty,
string mnemonic>
: I<(outs za_ty:$ZAda),
- (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
+ (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm),
mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
"", []>,
Sched<[]> {
@@ -72,26 +110,44 @@ class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz,
let Inst{9-5} = Zn;
let Inst{4} = S;
let Inst{3} = 0b0;
+
+ let Constraints = "$ZAda = $_ZAda";
}
-class sme_int_outer_product_i32<bits<3> opc, string mnemonic>
- : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, ZPR8,
- mnemonic> {
- bits<2> ZAda;
- let Inst{1-0} = ZAda;
- let Inst{2} = 0b0;
+multiclass sme_int_outer_product_i32<bits<3> opc, string mnemonic,
+ SDPatternOperator op> {
+ def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32,
+ ZPR8, mnemonic> {
+ bits<2> ZAda;
+ let Inst{1-0} = ZAda;
+ let Inst{2} = 0b0;
+ }
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8>;
+
+ def : Pat<(op imm0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm),
+ (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
}
-class sme_int_outer_product_i64<bits<3> opc, string mnemonic>
- : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, ZPR16,
- mnemonic> {
- bits<3> ZAda;
- let Inst{2-0} = ZAda;
+multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic,
+ SDPatternOperator op> {
+ def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64,
+ ZPR16, mnemonic> {
+ bits<3> ZAda;
+ let Inst{2-0} = ZAda;
+ }
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+ def : Pat<(op imm0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+ (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>;
}
class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
: I<(outs TileOp32:$ZAda),
- (ins PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm),
+ (ins TileOp32:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm),
mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm",
"", []>,
Sched<[]> {
@@ -109,14 +165,28 @@ class sme_outer_product_widening_inst<bit op, bit S, string mnemonic>
let Inst{4} = S;
let Inst{3-2} = 0b00;
let Inst{1-0} = ZAda;
+
+ let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme_bf16_outer_product<bit S, string mnemonic> {
- def : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+multiclass sme_bf16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+ def NAME : sme_outer_product_widening_inst<0b0, S, mnemonic>;
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+ def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+ (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
}
-multiclass sme_f16_outer_product<bit S, string mnemonic> {
- def : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+multiclass sme_f16_outer_product<bit S, string mnemonic, SDPatternOperator op> {
+ def NAME : sme_outer_product_widening_inst<0b1, S, mnemonic>;
+
+ def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>;
+
+ def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
+ (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)),
+ (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>;
}
//===----------------------------------------------------------------------===//
@@ -126,7 +196,7 @@ multiclass sme_f16_outer_product<bit S, string mnemonic> {
class sme_add_vector_to_tile_inst<bit op, bit V, MatrixTileOperand tile_ty,
ZPRRegOp zpr_ty, string mnemonic>
: I<(outs tile_ty:$ZAda),
- (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn),
+ (ins tile_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn),
mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn",
"", []>, Sched<[]> {
bits<3> Pm;
@@ -140,6 +210,8 @@ class sme_add_vector_to_tile_inst<bit op, bit V, MatrixTileOperand tile_ty,
let Inst{12-10} = Pn;
let Inst{9-5} = Zn;
let Inst{4-3} = 0b00;
+
+ let Constraints = "$ZAda = $_ZAda";
}
class sme_add_vector_to_tile_u32<bit V, string mnemonic>
@@ -225,6 +297,33 @@ multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">;
}
+multiclass sme_mem_ld_ss_patterns<Instruction Inst, SDPatternOperator Load,
+ Operand tile_ty, Operand offset_ty,
+ ComplexPattern addr,
+ ComplexPattern tileslice> {
+ // base, tileslice
+ def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>;
+
+ // reg + reg, tileslice
+ let AddedComplexity = 1 in {
+ def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+ offset_ty:$imm))),
+ (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>;
+ }
+}
+
+class sme_load_pseudo
+ : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
+ i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base, GPR64:$offset), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+ let mayLoad = 1;
+}
+
multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -264,6 +363,40 @@ multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
}
defm : sme_mem_ld_ss_aliases<NAME, is_col>;
+
+ // Pseudo instructions for lowering intrinsics, using immediates instead of
+ // tile registers.
+ def _PSEUDO_B : sme_load_pseudo;
+ def _PSEUDO_H : sme_load_pseudo;
+ def _PSEUDO_S : sme_load_pseudo;
+ def _PSEUDO_D : sme_load_pseudo;
+ def _PSEUDO_Q : sme_load_pseudo;
+
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
+ !if(is_col, int_aarch64_sme_ld1b_vert,
+ int_aarch64_sme_ld1b_horiz),
+ sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0,
+ tileslice8>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+ !if(is_col, int_aarch64_sme_ld1h_vert,
+ int_aarch64_sme_ld1h_horiz),
+ imm0_1, imm0_7, am_sve_regreg_lsl1,
+ tileslice16>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+ !if(is_col, int_aarch64_sme_ld1w_vert,
+ int_aarch64_sme_ld1w_horiz),
+ imm0_3, imm0_3, am_sve_regreg_lsl2,
+ tileslice32>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+ !if(is_col, int_aarch64_sme_ld1d_vert,
+ int_aarch64_sme_ld1d_horiz),
+ imm0_7, imm0_1, am_sve_regreg_lsl3,
+ tileslice64>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ !if(is_col, int_aarch64_sme_ld1q_vert,
+ int_aarch64_sme_ld1q_horiz),
+ imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4,
+ tileslice128>;
}
multiclass sme_mem_ld_ss<string mnemonic> {
@@ -310,6 +443,25 @@ multiclass sme_mem_st_ss_aliases<string inst, bit is_col> {
defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
}
+multiclass sme_mem_st_ss_patterns<Instruction Inst, SDPatternOperator Store,
+ Operand offset_ty,
+ ComplexPattern imm2tile,
+ ComplexPattern addr,
+ ComplexPattern tileslice> {
+ // base, tileslice
+ def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst $tile, $idx, $imm, $pg, $base, XZR)>;
+
+ // reg + reg, tileslice
+ let AddedComplexity = 1 in {
+ def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ (imm2tile untyped:$tile),
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst $tile, $idx, $imm, $pg, $base, $offset)>;
+ }
+}
+
multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -349,6 +501,32 @@ multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
}
defm : sme_mem_st_ss_aliases<NAME, is_col>;
+
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _B),
+ !if(is_col, int_aarch64_sme_st1b_vert,
+ int_aarch64_sme_st1b_horiz),
+ imm0_15, imm_to_tile8, am_sve_regreg_lsl0,
+ tileslice8>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _H),
+ !if(is_col, int_aarch64_sme_st1h_vert,
+ int_aarch64_sme_st1h_horiz),
+ imm0_7, imm_to_tile16, am_sve_regreg_lsl1,
+ tileslice16>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _S),
+ !if(is_col, int_aarch64_sme_st1w_vert,
+ int_aarch64_sme_st1w_horiz),
+ imm0_3, imm_to_tile32, am_sve_regreg_lsl2,
+ tileslice32>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _D),
+ !if(is_col, int_aarch64_sme_st1d_vert,
+ int_aarch64_sme_st1d_horiz),
+ imm0_1, imm_to_tile64, am_sve_regreg_lsl3,
+ tileslice64>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _Q),
+ !if(is_col, int_aarch64_sme_st1q_vert,
+ int_aarch64_sme_st1q_horiz),
+ sme_elm_idx0_0, imm_to_tile128,
+ am_sve_regreg_lsl4, tileslice128>;
}
multiclass sme_mem_st_ss<string mnemonic> {
@@ -360,7 +538,7 @@ multiclass sme_mem_st_ss<string mnemonic> {
// SME Save and Restore Array
//===----------------------------------------------------------------------===//
-class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr>
+class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr>
: I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "",
[]>,
Sched<[]> {
@@ -375,33 +553,61 @@ class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr>
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = imm4;
-
- let mayLoad = !not(isStore);
- let mayStore = isStore;
}
-multiclass sme_spill_fill<bit isStore, dag outs, dag ins, string opcodestr> {
- def NAME : sme_spill_fill_inst<isStore, outs, ins, opcodestr>;
-
+let mayStore = 1 in
+class sme_spill_inst<string opcodestr>
+ : sme_spill_fill_base<0b1, (outs),
+ (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
+ sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
+ imm0_15:$offset),
+ opcodestr>;
+let mayLoad = 1 in
+class sme_fill_inst<string opcodestr>
+ : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt),
+ (ins MatrixIndexGPR32Op12_15:$Rv,
+ sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
+ imm0_15:$offset),
+ opcodestr>;
+multiclass sme_spill<string opcodestr> {
+ def NAME : sme_spill_inst<opcodestr>;
def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
(!cast<Instruction>(NAME) MatrixOp:$ZAt,
MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
-}
-
-multiclass sme_spill<string opcodestr> {
- defm NAME : sme_spill_fill<0b1, (outs),
- (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
- sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
- imm0_15:$offset),
- opcodestr>;
+ // base
+ def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
+ (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>;
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx,
+ (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)),
+ (!cast<Instruction>(NAME) ZA, $idx, 0, $base, $imm4)>;
+ }
}
multiclass sme_fill<string opcodestr> {
- defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt),
- (ins MatrixIndexGPR32Op12_15:$Rv,
- sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
- imm0_15:$offset),
- opcodestr>;
+ def NAME : sme_fill_inst<opcodestr>;
+ def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
+ (!cast<Instruction>(NAME) MatrixOp:$ZAt,
+ MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
+ def NAME # _PSEUDO
+ : Pseudo<(outs),
+ (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4,
+ GPR64sp:$base), []>,
+ Sched<[]> {
+ // Translated to actual instruction in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+ let mayLoad = 1;
+ }
+ // base
+ def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
+ (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>;
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx,
+ (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)),
+ (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm4, $base)>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -429,8 +635,12 @@ class sme_vector_to_tile_inst<bit Q, bits<2> sz, MatrixTileVectorOperand tile_ty
bit is_col, Operand imm_ty, ZPRRegOp zpr_ty,
string mnemonic>
: sme_vector_to_tile_base<Q, is_col, sz, (outs tile_ty:$ZAd),
- (ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn),
- mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">;
+ (ins tile_ty:$_ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn),
+ mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">{
+
+ let Constraints = "$ZAd = $_ZAd";
+}
+
multiclass sme_vector_to_tile_aliases<Instruction inst,
MatrixTileVectorOperand tile_ty,
@@ -439,6 +649,30 @@ multiclass sme_vector_to_tile_aliases<Instruction inst,
(inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>;
}
+multiclass sme_vector_to_tile_patterns<Instruction inst, ValueType zpr_vt,
+ ValueType ppr_vt, Operand imm_ty,
+ Operand offset_ty,
+ SDPatternOperator op,
+ ComplexPattern tileslice> {
+ def : Pat<(op imm_ty:$tile, MatrixIndexGPR32Op12_15:$idx,
+ (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)),
+ (inst imm_ty:$tile, $idx, 0, $pg, $zn)>;
+ let AddedComplexity = 1 in {
+ def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+ offset_ty:$imm)),
+ (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)),
+ (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>;
+ }
+}
+
+class sme_mova_insert_pseudo
+ : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
+ i64imm:$imm, PPR3bAny:$pg, ZPRAny:$zn), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+}
+
multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
def _B : sme_vector_to_tile_inst<0b0, 0b00, !if(is_col, TileVectorOpV8,
TileVectorOpH8),
@@ -478,6 +712,14 @@ multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
let Inst{3-0} = ZAd;
}
+ // Pseudo instructions for lowering intrinsics, using immediates instead of
+ // tile registers.
+ def _PSEUDO_B : sme_mova_insert_pseudo;
+ def _PSEUDO_H : sme_mova_insert_pseudo;
+ def _PSEUDO_S : sme_mova_insert_pseudo;
+ def _PSEUDO_D : sme_mova_insert_pseudo;
+ def _PSEUDO_Q : sme_mova_insert_pseudo;
+
defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _B),
!if(is_col, TileVectorOpV8,
TileVectorOpH8),
@@ -498,6 +740,62 @@ multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
!if(is_col, TileVectorOpV128,
TileVectorOpH128),
ZPR128, sme_elm_idx0_0>;
+
+ defvar op = !if(is_col, int_aarch64_sme_write_vert,
+ int_aarch64_sme_write_horiz);
+
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
+ nxv16i8, nxv16i1, sme_elm_idx0_0, imm0_15,
+ op, tileslice8>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+ nxv8i16, nxv8i1, sme_elm_idx0_1, imm0_7,
+ op, tileslice16>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+ nxv8f16, nxv8i1, sme_elm_idx0_1, imm0_7,
+ op, tileslice16>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+ nxv8bf16, nxv8i1, sme_elm_idx0_1, imm0_7,
+ op, tileslice16>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+ nxv4i32, nxv4i1, sme_elm_idx0_3, imm0_3,
+ op, tileslice32>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+ nxv4f32, nxv4i1, sme_elm_idx0_3, imm0_3,
+ op, tileslice32>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+ nxv2i64, nxv2i1, sme_elm_idx0_7, imm0_1,
+ op, tileslice64>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+ nxv2f64, nxv2i1, sme_elm_idx0_7, imm0_1,
+ op, tileslice64>;
+
+ defvar opq = !if(is_col, int_aarch64_sme_writeq_vert,
+ int_aarch64_sme_writeq_horiz);
+
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv16i8, nxv16i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv8i16, nxv8i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv8f16, nxv8i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv8bf16, nxv8i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv4i32, nxv4i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv4f32, nxv4i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv2i64, nxv2i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
+ defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ nxv2f64, nxv2i1, sme_elm_idx0_15,
+ sme_elm_idx0_0, opq, tileslice128>;
}
multiclass sme_vector_to_tile<string mnemonic> {
@@ -526,8 +824,11 @@ class sme_tile_to_vector_inst<bit Q, bits<2> sz, ZPRRegOp zpr_ty,
MatrixTileVectorOperand tile_ty,
bit is_col, Operand imm_ty, string mnemonic>
: sme_tile_to_vector_base<Q, is_col, sz, (outs zpr_ty:$Zd),
- (ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
- mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]">;
+ (ins zpr_ty:$_Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
+ mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]"> {
+
+ let Constraints = "$Zd = $_Zd";
+}
multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty,
MatrixTileVectorOperand tile_ty,
@@ -536,6 +837,23 @@ multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty,
(inst zpr_ty:$Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), 1>;
}
+multiclass sme_tile_to_vector_patterns<Instruction inst, ValueType zpr_vt,
+ ValueType ppr_vt, Operand offset_ty,
+ ComplexPattern imm2tile,
+ ComplexPattern tileslice,
+ SDPatternOperator op> {
+ def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg),
+ (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx)),
+ (inst $passthru, $pg, $tile, $idx, 0)>;
+ let AddedComplexity = 1 in {
+ def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg),
+ (imm2tile untyped:$tile),
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+ offset_ty:$imm)))),
+ (inst $passthru, $pg, $tile, $idx, $imm)>;
+ }
+}
+
multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
def _B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8, !if(is_col, TileVectorOpV8,
TileVectorOpH8),
@@ -589,6 +907,62 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _Q), ZPR128,
!if(is_col, TileVectorOpV128,
TileVectorOpH128), sme_elm_idx0_0>;
+
+ defvar op = !if(is_col, int_aarch64_sme_read_vert,
+ int_aarch64_sme_read_horiz);
+
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _B),
+ nxv16i8, nxv16i1, imm0_15,
+ imm_to_tile8, tileslice8, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H),
+ nxv8i16, nxv8i1, imm0_7,
+ imm_to_tile16, tileslice16, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H),
+ nxv8f16, nxv8i1, imm0_7,
+ imm_to_tile16, tileslice16, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H),
+ nxv8bf16, nxv8i1, imm0_7,
+ imm_to_tile16, tileslice16, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _S),
+ nxv4i32, nxv4i1, imm0_3,
+ imm_to_tile32, tileslice32, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _S),
+ nxv4f32, nxv4i1, imm0_3,
+ imm_to_tile32, tileslice32, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _D),
+ nxv2i64, nxv2i1, imm0_1,
+ imm_to_tile64, tileslice64, op>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _D),
+ nxv2f64, nxv2i1, imm0_1,
+ imm_to_tile64, tileslice64, op>;
+
+ defvar opq = !if(is_col, int_aarch64_sme_readq_vert,
+ int_aarch64_sme_readq_horiz);
+
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv16i8, nxv16i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv8i16, nxv8i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv8f16, nxv8i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv8bf16, nxv8i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv4i32, nxv4i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv4f32, nxv4i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv2i64, nxv2i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
+ defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q),
+ nxv2f64, nxv2i1, sme_elm_idx0_0,
+ imm_to_tile128, tileslice128, opq>;
}
multiclass sme_tile_to_vector<string mnemonic> {
@@ -600,8 +974,11 @@ multiclass sme_tile_to_vector<string mnemonic> {
// SME Zero
//===----------------------------------------------------------------------===//
+// NOTE: This definition isn't really correct because there are outputs, i.e.
+// the tile registers being zeroed. We fix this up in a custom inserter that
+// marks the appropriate registers as being implicitly defined.
class sme_zero_inst<string mnemonic>
- : I<(outs MatrixTileList:$imm), (ins),
+ : I<(outs), (ins MatrixTileList:$imm),
mnemonic, "\t$imm", "", []>, Sched<[]> {
bits<8> imm;
let Inst{31-8} = 0b110000000000100000000000;
@@ -626,6 +1003,15 @@ multiclass sme_zero<string mnemonic> {
def : InstAlias<"zero\t\\{za0.s,za1.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10111011), 1>;
def : InstAlias<"zero\t\\{za0.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11011101), 1>;
def : InstAlias<"zero\t\\{za1.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11101110), 1>;
+
+ def NAME # _PSEUDO : Pseudo<(outs), (ins i64imm:$tilelist), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+ }
+
+ def : Pat<(int_aarch64_sme_zero imm:$imm),
+ (!cast<Instruction>(NAME # _PSEUDO) imm:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -651,6 +1037,15 @@ class sve2_int_perm_revd<string asm>
let ElementSize = ZPR128.ElementSize;
}
+multiclass sve2_int_perm_revd<string asm, SDPatternOperator op> {
+ def NAME : sve2_int_perm_revd<asm>;
+
+ def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME)>;
+}
+
class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
: I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd),
asm, "\t$Zd, $Zn, $Zm", "", []>,
@@ -672,11 +1067,16 @@ class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
let ElementSize = zpr_ty.ElementSize;
}
-multiclass sve2_clamp<string asm, bit U> {
+multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> {
def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
def _D : sve2_clamp<asm, 0b11, U, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
@@ -699,7 +1099,7 @@ class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
let Inst{3-0} = Pd;
}
-multiclass sve2_int_perm_sel_p<string asm> {
+multiclass sve2_int_perm_sel_p<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_sel_p<asm, PPR8, sme_elm_idx0_15> {
bits<4> imm;
let Inst{23-22} = imm{3-2};
@@ -723,4 +1123,32 @@ multiclass sve2_int_perm_sel_p<string asm> {
let Inst{22} = 0b1;
let Inst{20-18} = 0b000;
}
+
+ def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+ MatrixIndexGPR32Op12_15:$idx)),
+ (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, 0)>;
+ def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+ MatrixIndexGPR32Op12_15:$idx)),
+ (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, 0)>;
+ def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+ MatrixIndexGPR32Op12_15:$idx)),
+ (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, 0)>;
+ def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+ MatrixIndexGPR32Op12_15:$idx)),
+ (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, 0)>;
+
+ let AddedComplexity = 1 in {
+ def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+ (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))),
+ (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, $imm)>;
+ def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+ (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))),
+ (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, $imm)>;
+ def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+ (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))),
+ (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, $imm)>;
+ def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+ (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))),
+ (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, $imm)>;
+ }
}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9d4bdbe5d053..3631536a32b9 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -199,6 +199,11 @@ def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", [
def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
+def SVECpyDupImm8Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>;
+def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>;
+def SVECpyDupImm32Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i32>", []>;
+def SVECpyDupImm64Pat : ComplexPattern<i64, 2, "SelectSVECpyDupImm<MVT::i64>", []>;
+
def SVELogicalImm8Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8>", []>;
def SVELogicalImm16Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>", []>;
def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>;
@@ -209,14 +214,6 @@ def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16
def SVELogicalImm32NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32, true>", []>;
def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>;
-def SVE8BitLslImm32 : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
-def SVE8BitLslImm64 : ComplexPattern<i64, 2, "SelectSVE8BitLslImm", [imm]>;
-class SVE8BitLslImm<ValueType ty> {
- ComplexPattern Pat = !cond(
- !eq(ty, i32): SVE8BitLslImm32,
- !eq(ty, i64): SVE8BitLslImm64);
-}
-
def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>;
def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>;
def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>;
@@ -234,6 +231,8 @@ def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []
def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
def SVEShiftImmR64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<1, 64, true>", []>;
+def SVEShiftSplatImmR : ComplexPattern<iAny, 1, "SelectSVEShiftSplatImmR", []>;
+
def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>;
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
@@ -335,9 +334,14 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
-let Predicates = [HasSVEorStreamingSVE] in {
+let Predicates = [HasSVEorSME] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>;
+
+ def : Pat<(nxv16i1 immAllOnesV), (PTRUE_B 31)>;
+ def : Pat<(nxv8i1 immAllOnesV), (PTRUE_H 31)>;
+ def : Pat<(nxv4i1 immAllOnesV), (PTRUE_S 31)>;
+ def : Pat<(nxv2i1 immAllOnesV), (PTRUE_D 31)>;
}
//===----------------------------------------------------------------------===//
@@ -370,24 +374,27 @@ class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType
: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)),
(inst $Op3, $Op1, $Op2)>;
-class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
- ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))),
- (inst $Op1, i32:$imm, i32:$shift)>;
+multiclass SVE_1_Op_PassthruUndef_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+ ValueType vts, Instruction inst>{
+ def : Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), (vtd undef))),
+ (inst (IMPLICIT_DEF), $Op1, $Op2)>;
+ def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (i64 timm0_1), vtd:$Op3)),
+ (inst $Op3, $Op1, $Op2)>;
+}
class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
+ : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+ : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
+ : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i64:$imm)))))),
(inst $Op1, i64:$imm)>;
class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
@@ -489,20 +496,20 @@ multiclass SVE_InReg_Extend_PassthruUndef<ValueType vt, SDPatternOperator op, Va
class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
ComplexPattern cast, Instruction inst>
-: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))),
(inst $Pg, $Rn, i32:$imm)>;
class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
ComplexPattern cast, Instruction inst>
-: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))),
(inst $Rn, i32:$imm)>;
class SVE_2_Op_Fp_Imm_Pat<ValueType vt, SDPatternOperator op,
ValueType pt, ValueType it,
FPImmLeaf immL, int imm,
Instruction inst>
-: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (AArch64dup (it immL))))),
+: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (splat_vector (it immL))))),
(inst $Pg, $Zs1, imm)>;
class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op,
@@ -510,9 +517,33 @@ class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op,
FPImmLeaf immL, int imm,
Instruction inst>
: Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Zs1, (SVEDup0)),
- (vt (AArch64dup (it immL))))),
+ (vt (splat_vector (it immL))))),
(inst $Pg, $Zs1, imm)>;
+// Used to re-order the operands of BSP when lowering to BSL. BSP has the order:
+// mask, in1, in2 whereas BSL for SVE2 has them ordered in1, in2, mask
+class SVE_3_Op_BSP_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+ (inst $Op2, $Op3, $Op1)>;
+
+class SVE_Shift_Add_All_Active_Pat<ValueType vtd, SDPatternOperator op, ValueType pt,
+ ValueType vt1, ValueType vt2, ValueType vt3,
+ Instruction inst>
+: Pat<(vtd (add vt1:$Op1, (op (pt (SVEAllActive)), vt2:$Op2, vt3:$Op3))),
+ (inst $Op1, $Op2, $Op3)>;
+
+//===----------------------------------------------------------------------===//
+// SVE pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+// Matches either an intrinsic, or a predicated operation with an all active predicate
+class EitherVSelectOrPassthruPatFrags<SDPatternOperator intrinsic, SDPatternOperator sdnode>
+: PatFrags<(ops node:$Pg, node:$Op1, node:$Op2), [
+ (intrinsic node:$Pg, node:$Op1, node:$Op2),
+ (vselect node:$Pg, (sdnode (SVEAllActive), node:$Op1, node:$Op2), node:$Op1),
+ ]>;
+
//
// Pseudo -> Instruction mappings
//
@@ -612,10 +643,11 @@ class sve_int_pfalse<bits<6> opc, string asm>
multiclass sve_int_pfalse<bits<6> opc, string asm> {
def NAME : sve_int_pfalse<opc, asm>;
- def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv16i1 immAllZerosV), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv8i1 immAllZerosV), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv4i1 immAllZerosV), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv2i1 immAllZerosV), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv1i1 immAllZerosV), (!cast<Instruction>(NAME))>;
}
class sve_int_ptest<bits<6> opc, string asm>
@@ -885,6 +917,8 @@ class sve_int_count<bits<3> opc, string asm>
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rd;
+
+ let isReMaterializable = 1;
}
multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> {
@@ -965,7 +999,7 @@ class sve_int_pred_pattern_a<bits<3> opc, string asm>
multiclass sve_int_pred_pattern_a<bits<3> opc, string asm,
SDPatternOperator op,
SDPatternOperator opcnt> {
- let Predicates = [HasSVEorStreamingSVE] in {
+ let Predicates = [HasSVEorSME] in {
def NAME : sve_int_pred_pattern_a<opc, asm>;
def : InstAlias<asm # "\t$Rdn, $pattern",
@@ -974,7 +1008,7 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm,
(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
}
- let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL] in {
+ let Predicates = [HasSVEorSME, UseScalarIncVL] in {
def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))),
(!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>;
@@ -1170,28 +1204,45 @@ multiclass sve_int_perm_dup_i<string asm> {
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
// Duplicate extracted element of vector into all vector elements
- def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))),
+ def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))),
(!cast<Instruction>(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>;
- def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ def : Pat<(nxv8i16 (splat_vector (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
(!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
- def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ def : Pat<(nxv4i32 (splat_vector (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
(!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
- def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ def : Pat<(nxv2i64 (splat_vector (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
(!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
- def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ def : Pat<(nxv8f16 (splat_vector (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
(!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
- def : Pat<(nxv8bf16 (AArch64dup (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+ def : Pat<(nxv8bf16 (splat_vector (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
(!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
- def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ def : Pat<(nxv4f16 (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
(!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
- def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ def : Pat<(nxv2f16 (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
(!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
- def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+ def : Pat<(nxv4f32 (splat_vector (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
(!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
- def : Pat<(nxv2f32 (AArch64dup (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ def : Pat<(nxv2f32 (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
(!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
- def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+ def : Pat<(nxv2f64 (splat_vector (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
(!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+
+ def : Pat<(nxv16i8 (AArch64duplane128 nxv16i8:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv8i16 (AArch64duplane128 nxv8i16:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv4i32 (AArch64duplane128 nxv4i32:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv2i64 (AArch64duplane128 nxv2i64:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv8f16 (AArch64duplane128 nxv8f16:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv4f32 (AArch64duplane128 nxv4f32:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv2f64 (AArch64duplane128 nxv2f64:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
+ def : Pat<(nxv8bf16 (AArch64duplane128 nxv8bf16:$Op1, i64:$imm)),
+ (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
}
class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty,
@@ -1631,6 +1682,7 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv1i1, op, nxv1i1, nxv1i1, nxv1i1, !cast<Instruction>(NAME)>;
def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1,
!cast<Instruction>(NAME), PTRUE_B>;
def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1,
@@ -1743,7 +1795,7 @@ multiclass sve_int_dup_mask_imm<string asm> {
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
- def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))),
+ def : Pat<(nxv2i64 (splat_vector (i64 logical_imm64:$imm))),
(!cast<Instruction>(NAME) logical_imm64:$imm)>;
}
@@ -2478,7 +2530,7 @@ multiclass sve2_fp_mla_long<bits<2> opc, string asm, SDPatternOperator op> {
// SVE Stack Allocation Group
//===----------------------------------------------------------------------===//
-class sve_int_arith_vl<bit opc, string asm>
+class sve_int_arith_vl<bit opc, string asm, bit streaming_sve = 0b0>
: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
asm, "\t$Rd, $Rn, $imm6",
"",
@@ -2490,12 +2542,13 @@ class sve_int_arith_vl<bit opc, string asm>
let Inst{22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Rn;
- let Inst{15-11} = 0b01010;
+ let Inst{15-12} = 0b0101;
+ let Inst{11} = streaming_sve;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
}
-class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
+class sve_int_read_vl_a<bit op, bits<5> opc2, string asm, bit streaming_sve = 0b0>
: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
asm, "\t$Rd, $imm6",
"",
@@ -2506,9 +2559,12 @@ class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
let Inst{22} = op;
let Inst{21} = 0b1;
let Inst{20-16} = opc2{4-0};
- let Inst{15-11} = 0b01010;
+ let Inst{15-12} = 0b0101;
+ let Inst{11} = streaming_sve;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
+
+ let isReMaterializable = 1;
}
//===----------------------------------------------------------------------===//
@@ -2589,8 +2645,8 @@ multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
SDPatternOperator int_op,
SDPatternOperator ir_op, ValueType vt1,
ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
- def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;
-
+ def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>,
+ SVEPseudo2Instr<NAME, 1>;
// convert vt1 to a packed type for the intrinsic patterns
defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16,
!eq(!cast<string>(vt1), "nxv4f16"): nxv8f16,
@@ -2604,8 +2660,11 @@ multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
1 : vt3);
def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>;
-
def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
+
+ def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>;
}
multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
@@ -2614,7 +2673,8 @@ multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
SDPatternOperator int_op,
SDPatternOperator ir_op, ValueType vt1,
ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
- def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;
+ def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>,
+ SVEPseudo2Instr<NAME, 1>;
// convert vt1 to a packed type for the intrinsic patterns
defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16,
@@ -2623,8 +2683,11 @@ multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
1 : vt1);
def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>;
-
def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
+
+ def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>;
+
+ defm : SVE_1_Op_PassthruUndef_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>;
}
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
@@ -2726,11 +2789,19 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
- def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
- def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_log<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags> {
+ let DestructiveInstType = flags in {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -3756,7 +3827,8 @@ class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
}
multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
- SDPatternOperator op> {
+ SDPatternOperator op,
+ SDPatternOperator shift_op = null_frag> {
def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -3773,6 +3845,11 @@ multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_Shift_Add_All_Active_Pat<nxv16i8, shift_op, nxv16i1, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME # _B)>;
+ def : SVE_Shift_Add_All_Active_Pat<nxv8i16, shift_op, nxv8i1, nxv8i16, nxv8i16, i32, !cast<Instruction>(NAME # _H)>;
+ def : SVE_Shift_Add_All_Active_Pat<nxv4i32, shift_op, nxv4i1, nxv4i32, nxv4i32, i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_Shift_Add_All_Active_Pat<nxv2i64, shift_op, nxv2i1, nxv2i64, nxv2i64, i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
@@ -4331,18 +4408,6 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
- def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
- def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
- def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
-
- def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
-}
-
class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
@@ -4458,7 +4523,8 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator ir_op = null_frag> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
@@ -4472,6 +4538,12 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperato
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+
+
+ def : SVE_3_Op_BSP_Pat<nxv16i8, ir_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_BSP_Pat<nxv8i16, ir_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_BSP_Pat<nxv4i32, ir_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_BSP_Pat<nxv2i64, ir_op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
@@ -4578,29 +4650,28 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
}
multiclass sve_int_dup_imm_pred_merge_inst<
- bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
- ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm,
+ ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> {
let Constraints = "$Zd = $_Zd" in
def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
(ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
- def : Pat<(intty
- (vselect predty:$Pg,
- (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))),
- intty:$Zd)),
- (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
+ def : Pat<(vselect predty:$Pg,
+ (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))),
+ ZPR:$Zd),
+ (!cast<Instruction>(NAME) $Zd, $Pg, $imm, $shift)>;
}
multiclass sve_int_dup_imm_pred_merge<string asm> {
- defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
- i32, cpy_imm8_opt_lsl_i8>;
- defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
- i32, cpy_imm8_opt_lsl_i16>;
- defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
- i32, cpy_imm8_opt_lsl_i32>;
- defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
- i64, cpy_imm8_opt_lsl_i64>;
+ defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
+ nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
+ defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
+ nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>;
+ defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32,
+ nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>;
+ defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64,
+ nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
@@ -4608,11 +4679,24 @@ multiclass sve_int_dup_imm_pred_merge<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
+
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _H) $Zd, $Pg, 0, 0)>;
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f16 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _S) $Zd, $Pg, 0, 0)>;
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f16 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>;
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f32 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _S) $Zd, $Pg, 0, 0)>;
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f32 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>;
+ def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f64 ZPR:$Zd)),
+ (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>;
}
multiclass sve_int_dup_imm_pred_zero_inst<
- bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
- ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm,
+ ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> {
def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
(ins PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
@@ -4623,22 +4707,21 @@ multiclass sve_int_dup_imm_pred_zero_inst<
(!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
- def : Pat<(intty
- (vselect predty:$Pg,
- (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))),
- (intty (AArch64dup (scalarty 0))))),
- (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
+ def : Pat<(vselect predty:$Pg,
+ (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))),
+ (intty (splat_vector (scalarty 0)))),
+ (!cast<Instruction>(NAME) $Pg, $imm, $shift)>;
}
multiclass sve_int_dup_imm_pred_zero<string asm> {
- defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
- i32, cpy_imm8_opt_lsl_i8>;
- defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
- i32, cpy_imm8_opt_lsl_i16>;
- defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
- i32, cpy_imm8_opt_lsl_i32>;
- defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
- i64, cpy_imm8_opt_lsl_i64>;
+ defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
+ nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
+ defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
+ nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>;
+ defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32,
+ nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>;
+ defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64,
+ nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
}
//===----------------------------------------------------------------------===//
@@ -4690,6 +4773,10 @@ multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt
(cmp $Op1, $Op2)>;
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)),
(cmp $Op1, $Op2)>;
+ def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op1, (SVEDup0), cc))),
+ (cmp $Pg, $Op1)>;
+ def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), (SVEDup0), intvt:$Op1, invcc))),
+ (cmp $Pg, $Op1)>;
}
multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
@@ -4761,14 +4848,26 @@ multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
ValueType predvt, ValueType intvt,
Operand immtype, Instruction cmp> {
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
- (intvt ZPR:$Zs1),
- (intvt (AArch64dup (immtype:$imm))),
- cc)),
+ (intvt ZPR:$Zs1),
+ (intvt (splat_vector (immtype:$imm))),
+ cc)),
(cmp $Pg, $Zs1, immtype:$imm)>;
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
- (intvt (AArch64dup (immtype:$imm))),
- (intvt ZPR:$Zs1),
- commuted_cc)),
+ (intvt (splat_vector (immtype:$imm))),
+ (intvt ZPR:$Zs1),
+ commuted_cc)),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+ def : Pat<(predvt (and predvt:$Pg,
+ (AArch64setcc_z (predvt (AArch64ptrue 31)),
+ (intvt ZPR:$Zs1),
+ (intvt (splat_vector (immtype:$imm))),
+ cc))),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+ def : Pat<(predvt (and predvt:$Pg,
+ (AArch64setcc_z (predvt (AArch64ptrue 31)),
+ (intvt (splat_vector (immtype:$imm))),
+ (intvt ZPR:$Zs1),
+ commuted_cc))),
(cmp $Pg, $Zs1, immtype:$imm)>;
}
@@ -5148,6 +5247,8 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{15-10} = 0b010000;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
+
+ let isReMaterializable = 1;
}
multiclass sve_int_index_ii<string asm> {
@@ -5166,13 +5267,13 @@ multiclass sve_int_index_ii<string asm> {
(!cast<Instruction>(NAME # "_D") (i64 0), simm5_64b:$imm5b)>;
// add(step_vector(step), dup(X)) -> index(X, step).
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+ def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
- def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+ def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>;
- def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+ def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
}
@@ -5211,35 +5312,35 @@ multiclass sve_int_index_ir<string asm, SDPatternOperator mulop, SDPatternOperat
(!cast<Instruction>(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
// add(step_vector(step), dup(X)) -> index(X, step).
- def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+ def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
- def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+ def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
- def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+ def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, (!cast<Instruction>("MOVi32imm") $imm))>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (!cast<Instruction>("MOVi64imm") $imm))>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
// mul(step_vector(1), dup(Y)) -> index(0, Y).
- def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
- def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
- def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
- def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
+ def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
(!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
// add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
- def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))),
+ def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
- def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))),
+ def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
- def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))),
+ def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
- def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))),
+ def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
}
@@ -5267,13 +5368,13 @@ multiclass sve_int_index_ri<string asm> {
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
// add(step_vector(step), dup(X)) -> index(X, step).
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_B") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
- def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_H") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>;
- def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),
+ def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
(!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
(!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
}
@@ -5301,25 +5402,25 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator mulop> {
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
// add(step_vector(step), dup(X)) -> index(X, step).
- def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_B") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
- def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_H") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>;
- def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_S") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") $imm))>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
(!cast<Instruction>(NAME # "_D") GPR64:$Rn, (!cast<Instruction>("MOVi64imm") $imm))>;
- def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
(!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
// add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
- def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
- def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
- def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))),
+ def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
(!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
- def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),(nxv2i64 (AArch64dup(i64 GPR64:$Rn)))),
+ def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
(!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
}
@@ -5972,25 +6073,25 @@ multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
SDPatternOperator op,
RegisterOperand zprext,
ValueType vt> {
- def _SCALED_REAL : sve_mem_sst_sv2<msz, 1, asm, zprext>;
+ def _SCALED : sve_mem_sst_sv2<msz, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ (!cast<Instruction>(NAME # _SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt),
- (!cast<Instruction>(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm,
SDPatternOperator op,
ValueType vt> {
- def _REAL : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
+ def NAME : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+ (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
- (!cast<Instruction>(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
@@ -8433,6 +8534,7 @@ def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", [
def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>;
+def am_sve_regreg_lsl4 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<4>", []>;
// Predicated pseudo floating point two operand instructions.
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 4a24162540a5..ccb34f367338 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -305,8 +305,7 @@ bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
// ..where the value stored comes from a vector extract..
auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
- if (!IntrI ||
- IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_extract)
return false;
// ..that is extracting from index 0..
@@ -365,8 +364,7 @@ bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
// ..whose operand is a vector_insert..
auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
- if (!IntrI ||
- IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_insert)
return false;
// ..that is inserting into index zero of an undef vector..
@@ -451,8 +449,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
continue;
switch (F.getIntrinsicID()) {
- case Intrinsic::experimental_vector_extract:
- case Intrinsic::experimental_vector_insert:
+ case Intrinsic::vector_extract:
+ case Intrinsic::vector_insert:
case Intrinsic::aarch64_sve_ptrue:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 5906a5d6b50b..71303611265c 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -634,7 +634,8 @@ namespace AArch64SysReg {
FeatureBitset FeaturesRequired;
bool haveFeatures(FeatureBitset ActiveFeatures) const {
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ return ActiveFeatures[llvm::AArch64::FeatureAll] ||
+ (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
}
};